File size: 26,105 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
# CHILDES XML Corpus Reader

# Copyright (C) 2001-2023 NLTK Project
# Author: Tomonori Nagano <[email protected]>
#         Alexis Dimitriadis <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

Corpus reader for the XML version of the CHILDES corpus.

"""

__docformat__ = "epytext en"

import re
from collections import defaultdict

from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
from nltk.util import LazyConcatenation, LazyMap, flatten

# to resolve the namespace issue
NS = "http://www.talkbank.org/ns/talkbank"


class CHILDESCorpusReader(XMLCorpusReader):
    """

    Corpus reader for the XML version of the CHILDES corpus.

    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML

    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.

    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory

    (``nltk_data/corpora/CHILDES/``).



    For access to the file text use the usual nltk functions,

    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.

    """

    def __init__(self, root, fileids, lazy=True):
        """
        Initialise the reader and remember whether results should be lazy.

        :param root: Forwarded unchanged to ``XMLCorpusReader.__init__``.
        :param fileids: Forwarded unchanged to ``XMLCorpusReader.__init__``.
        :param lazy: Stored as ``self._lazy``. The accessor methods return
            ``LazyMap``/``LazyConcatenation`` objects when it is true and
            plain per-file lists when it is false.
        """
        super().__init__(root, fileids)
        self._lazy = lazy

    def words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        Read the untagged word tokens of the given file(s).

        :return: in lazy mode, a single lazy sequence concatenating the
            tokens of every file; otherwise a list with one result per file
        :rtype: list(str)

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, each token becomes a tuple of the stem, its
            tag and an ``index|head|relation`` string, and the tokens are
            grouped per utterance.
        :param strip_space: If true, then strip surrounding whitespace from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """

        def read_file(path):
            # sent=None and pos=False: flat token stream, no tags.
            return self._get_words(
                path, speaker, None, stem, relation, False, strip_space, replace
            )

        if self._lazy:
            return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
        return [read_file(path) for path in self.abspaths(fileids)]

    def tagged_words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        Read the word tokens of the given file(s) together with their tags.

        :return: in lazy mode, a single lazy sequence concatenating the
            ``(word,tag)`` tuples of every file; otherwise a list with one
            result per file
        :rtype: list(tuple(str,str))

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, each token becomes a tuple of the stem, its
            tag and an ``index|head|relation`` string, and the tokens are
            grouped per utterance.
        :param strip_space: If true, then strip surrounding whitespace from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """

        def read_file(path):
            # sent=None and pos=True: flat stream of (word, tag) pairs.
            return self._get_words(
                path, speaker, None, stem, relation, True, strip_space, replace
            )

        if self._lazy:
            return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
        return [read_file(path) for path in self.abspaths(fileids)]

    def sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        Read the given file(s) utterance by utterance, without tags.

        :return: in lazy mode, a single lazy sequence concatenating the
            utterances of every file, each utterance being a list of word
            strings; otherwise a list with one result per file
        :rtype: list(list(str))

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip surrounding whitespace from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """

        def read_file(path):
            # sent=True and pos=False: one list of plain tokens per utterance.
            return self._get_words(
                path, speaker, True, stem, relation, False, strip_space, replace
            )

        if self._lazy:
            return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
        return [read_file(path) for path in self.abspaths(fileids)]

    def tagged_sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        Read the given file(s) utterance by utterance, with tags.

        :return: in lazy mode, a single lazy sequence concatenating the
            utterances of every file, each utterance being a list of
            ``(word,tag)`` tuples; otherwise a list with one result per file
        :rtype: list(list(tuple(str,str)))

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip surrounding whitespace from
            word tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """

        def read_file(path):
            # sent=True and pos=True: one list of (word, tag) pairs per utterance.
            return self._get_words(
                path, speaker, True, stem, relation, True, strip_space, replace
            )

        if self._lazy:
            return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
        return [read_file(path) for path in self.abspaths(fileids)]

    def corpus(self, fileids=None):
        """
        Collect the root-element attributes of the given file(s).

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :return: one dict of ``(corpus_property_key, value)`` per file; a
            ``LazyMap`` in lazy mode, a plain list otherwise
        :rtype: list(dict)
        """
        if self._lazy:
            return LazyMap(self._get_corpus, self.abspaths(fileids))
        return [self._get_corpus(path) for path in self.abspaths(fileids)]

    def _get_corpus(self, fileid):
        results = dict()
        xmldoc = ElementTree.parse(fileid).getroot()
        for key, value in xmldoc.items():
            results[key] = value
        return results

    def participants(self, fileids=None):
        """
        Collect the participant records of the given file(s).

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :return: one mapping per file, keyed by participant id, whose values
            are dicts of ``(participant_property_key, value)``; a ``LazyMap``
            in lazy mode, a plain list otherwise
        :rtype: list(dict)
        """
        if self._lazy:
            return LazyMap(self._get_participants, self.abspaths(fileids))
        return [self._get_participants(path) for path in self.abspaths(fileids)]

    def _get_participants(self, fileid):
        """
        Build a nested mapping ``{participant id: {attribute: value}}``.

        Every ``participant`` element found below a ``Participants`` element
        contributes its XML attributes, stored under the value of its ``id``
        attribute. The result is an auto-vivifying ``defaultdict``, so reading
        a missing key creates an empty nested mapping.
        """

        def nested():
            # defaultdict whose missing values are again nested defaultdicts
            return defaultdict(nested)

        root = ElementTree.parse(fileid).getroot()
        by_id = nested()
        query = f".//{{{NS}}}Participants/{{{NS}}}participant"
        for node in root.findall(query):
            for attr_name, attr_value in node.items():
                by_id[node.get("id")][attr_name] = attr_value
        return by_id

    def age(self, fileids=None, speaker="CHI", month=False):
        """
        Look up the recorded age of ``speaker`` in the given file(s).

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :param speaker: The participant id whose age is wanted.
        :param month: If true, return months instead of year-month-date
        :return: one age value per file (as produced by ``_get_age``); a
            ``LazyMap`` in lazy mode, a plain list otherwise
        :rtype: list or int
        """

        def read_age(path):
            return self._get_age(path, speaker, month)

        if self._lazy:
            return LazyMap(read_age, self.abspaths(fileids))
        return [read_age(path) for path in self.abspaths(fileids)]

    def _get_age(self, fileid, speaker, month):
        """
        Return the ``age`` attribute of the first participant whose ``id``
        equals ``speaker``.

        :param month: If true, the attribute is passed through
            ``convert_age`` and the number of months is returned.
        :return: the raw attribute value, the converted month count, or
            ``None`` when no participant matches or the conversion fails.
        """
        root = ElementTree.parse(fileid).getroot()
        for node in root.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
            if node.get("id") != speaker:
                continue
            raw_age = node.get("age")
            if not month:
                return raw_age
            try:
                return self.convert_age(raw_age)
            # some files don't have age data
            except (TypeError, AttributeError):
                return None
        return None

    def convert_age(self, age_year):
        """
        Calculate age in months from a string in CHILDES format.

        The string is matched against ``P<years>Y<months>M<days>D`` where the
        ``M`` marker, the day digits and the ``D`` marker are optional. A day
        count above 15 rounds the result up by one month.

        :param age_year: The age string, e.g. ``"P2Y6M20D"``.
        :return: the age in whole months
        :rtype: int
        :raise AttributeError: if the string does not match the pattern.
        """
        parsed = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
        years, months, days = parsed.groups()
        total_months = 12 * int(years) + int(months)
        # The day group may be empty: some corpora give no day information.
        if days and int(days) > 15:
            total_months += 1
        return total_months

    def MLU(self, fileids=None, speaker="CHI"):
        """
        Compute the MLU figure of ``speaker`` for the given file(s).

        :param fileids: The file(s) to read, resolved through ``abspaths()``.
        :param speaker: The participant whose utterances are measured.
        :return: one number per file (as produced by ``_getMLU``); a
            ``LazyMap`` in lazy mode, a plain list otherwise
        :rtype: list(float)
        """

        def measure(path):
            return self._getMLU(path, speaker=speaker)

        if self._lazy:
            return LazyMap(measure, self.abspaths(fileids))
        return [measure(path) for path in self.abspaths(fileids)]

    def _getMLU(self, fileid, speaker):
        """
        Return the ratio of morphemes to utterances for one file.

        Utterances are read stemmed, tagged and with replacements applied.
        An utterance is skipped when any of its tags is ``"unk"``, when it is
        empty, or when it equals the previously kept utterance. Tokens tagged
        ``"co"`` or ``None`` are treated as fillers: they are subtracted from
        the morpheme count, and every utterance containing one is subtracted
        from the utterance count. A token contributes one morpheme per
        ``-``-separated part.

        :return: morphemes divided by utterances, or ``0`` when the utterance
            count is zero.
        """
        utterances = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        kept = []
        previous = []
        filler_total = 0
        discounted = 0
        for utterance in utterances:
            tags = [tag for (_, tag) in utterance]
            # Skip utterances with an unintelligible part, empty utterances
            # and immediate repetitions of the last kept utterance.
            if "unk" in tags or utterance == [] or utterance == previous:
                continue
            kept.append([token for (token, _) in utterance])
            # count number of fillers
            fillers_here = tags.count("co") + tags.count(None)
            if fillers_here:
                filler_total += fillers_here
                discounted += 1
            previous = utterance
        try:
            tokens = flatten(kept)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            parts = flatten([token.split("-") for token in tokens])
            morpheme_count = len(parts) - filler_total
            utterance_count = len(kept) - discounted
            mlu = morpheme_count / utterance_count
        except ZeroDivisionError:
            mlu = 0
        return mlu

    def _get_words(

        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace

    ):
        """
        Extract the tokens of one CHILDES XML file.

        The file is parsed with ``ElementTree``; every ``u`` (utterance)
        element in the TalkBank namespace is visited and, if its ``who``
        attribute is selected by ``speaker``, each ``w`` element below it
        yields one token.

        :param fileid: Source handed to ``ElementTree.parse``.
        :param speaker: ``"ALL"``, a single speaker id, or a container of
            speaker ids. A single id is wrapped in a list first.
        :param sent: If true, tokens are grouped into one list per utterance.
        :param stem: If true, the token text is replaced by the text of the
            ``stem`` element, extended with ``-<inflection>`` and
            ``~<suffix stem>`` when those elements are present.
        :param relation: Enables both the stem and the tag processing. When it
            is exactly ``True`` the token is additionally extended with
            ``index|head|relation`` strings taken from ``gra`` elements.
            A true value also groups tokens per utterance.
        :param pos: If true, the token becomes a ``(word, tag)`` tuple.
        :param strip_space: If true, ``str.strip()`` is applied to the token.
        :param replace: If true, try to use replacement words (see the review
            notes in the body).
        :return: a ``LazyMap`` applying the identity function to the collected
            list of tokens or utterances.
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            # tokens of the current utterance
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): both lookups start from ``xmlsent``, not
                    # from ``xmlword``, so the first replacement found in the
                    # utterance is used for every word of that utterance.
                    # NOTE(review): the result of ``find`` is tested for
                    # truthiness; ElementTree documents element truth value as
                    # depending on child count, so a childless ``wk`` match
                    # presumably never takes the second branch -- confirm and
                    # prefer ``is not None``.
                    if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip leading and trailing whitespace
                    if strip_space:
                        word = word.strip()
                    # stem
                    if relation or stem:
                        # a missing <stem> element leaves the surface form in place
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            pass
                        # if there is an inflection
                        # NOTE(review): the bare ``except`` hides every error,
                        # not only the missing-element case.
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos
                    if relation or pos:
                        # tag = text of the first <c>, plus ":" and the text of
                        # the first <s> when one exists; "" when there is no <c>
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        # same construction for the tag of a <mor-post> suffix
                        # NOTE(review): bare ``except`` -- see above.
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            if not xmlstem_rel.get("type") == "grt":
                                # ordinary annotation: (word, tag, relation)
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                # gold annotation: repeat word and tag and
                                # append the gold relation as a sixth field.
                                # NOTE(review): ``word[2]`` requires that a
                                # non-"grt" <gra> was seen first -- confirm.
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        # NOTE(review): ``suffixStem`` is a string at this
                        # point, so the indexing below picks characters, and
                        # the rebuilt value is never added to the output; any
                        # failure is swallowed by the bare ``except``.
                        try:
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        except:
                            pass
                    sents.append(word)
                # keep utterance boundaries for sentence or relation output,
                # otherwise flatten the tokens into the file-level list
                if sent or relation:
                    results.append(sents)
                else:
                    results.extend(sents)
        return LazyMap(lambda x: x, results)

    # Ready-to-use browser opener
    #
    # ``webview_file`` forms the address it opens by appending the
    # corpus-relative transcript path to ``childes_url_base``.

    """

    The base URL for viewing files on the childes website. This

    shouldn't need to be changed, unless CHILDES changes the configuration

    of their server or unless the user sets up their own corpus webserver.

    """
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="

    def webview_file(self, fileid, urlbase=None):
        """Map a corpus file to its web version on the CHILDES website,
        and open it in a web browser.

        The complete URL to be used is:
            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

        If no urlbase is passed, we try to calculate it.  This
        requires that the childes corpus was set up to mirror the
        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

        The function first looks for "/childes/", possibly followed by
        "data-xml", on the path consisting of <corpus root>+fileid; then
        (as a special case) for "Eng-USA". Both checks ignore case. If
        neither one is found, or the path does not end in ".xml", we use
        the unmodified fileid and hope for the best.
        If this is not right, specify urlbase explicitly, e.g., if the
        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.

        :param fileid: The corpus file to show.
        :param urlbase: Optional path prefix placed before ``fileid``.
        """

        import webbrowser

        if urlbase:
            path = urlbase + "/" + fileid
        else:
            # Normalise Windows separators so the patterns only deal with "/".
            full = (self.root + "/" + fileid).replace("\\", "/")
            lowered = full.lower()
            # Fallback when the folder layout is not recognised.
            path = fileid
            if "/childes/" in lowered:
                # Discard /data-xml/ if present
                found = re.search(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)
                if found:
                    path = found.group(1)
            elif "eng-usa" in lowered:
                # The inline flag must lead the pattern: Python 3.11+ rejects
                # a global flag placed anywhere else in the expression.
                found = re.search(r"(?i)/Eng-USA/(.*)\.xml", full)
                if found:
                    path = "Eng-USA/" + found.group(1)

        # Strip ".xml" and add ".cha", as necessary:
        if path.endswith(".xml"):
            path = path[:-4]

        if not path.endswith(".cha"):
            path = path + ".cha"

        url = self.childes_url_base + path

        webbrowser.open_new_tab(url)
        print("Opening in browser:", url)
        # Pausing is a good idea, but it's up to the user...
        # raw_input("Hit Return to continue")


def demo(corpus_root=None):
    """
    Print a short tour of the first five files of a CHILDES corpus.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``

    :param corpus_root: Directory holding the XML files. When omitted,
        ``corpora/childes/data-xml/Eng-USA/`` is looked up with
        ``nltk.data.find``.
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        # The dot before the extension is escaped so that only names ending
        # in a literal ".xml" are selected.
        childes = CHILDESCorpusReader(corpus_root, r".*\.xml")
        # describe all corpus
        for fileid in childes.fileids()[:5]:
            properties = childes.corpus(fileid)[0]
            corpus = properties.get("Corpus", "")
            corpus_id = properties.get("Id", "")
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for participant, values in childes.participants(fileid)[0].items():
                for key, value in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()

    except LookupError:
        print(
            """The CHILDES corpus, or the parts you need, should be manually

        downloaded from https://childes.talkbank.org/data-xml/ and saved at

        [NLTK_Data_Dir]/corpora/childes/

            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:

        demo('/path/to/childes/data-xml/Eng-USA/')

        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())


# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()