File size: 2,051 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
.. Copyright (C) 2001-2023 NLTK Project
.. For license information, see LICENSE.TXT

    >>> import os.path

    >>> from nltk.corpus.reader import BNCCorpusReader
    >>> import nltk.test

    >>> root = os.path.dirname(nltk.test.__file__)
    >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')

Checking word access.
---------------------

    >>> len(bnc.words())
    151

    >>> bnc.words()[:6]
    ['Ah', 'there', 'we', 'are', ',', '.']
    >>> bnc.words(stem=True)[:6]
    ['ah', 'there', 'we', 'be', ',', '.']

    >>> bnc.tagged_words()[:6]
    [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]

    >>> bnc.tagged_words(c5=True)[:6]
    [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]

Testing access to the sentences.
--------------------------------

    >>> len(bnc.sents())
    15

    >>> bnc.sents()[0]
    ['Ah', 'there', 'we', 'are', ',', '.']
    >>> bnc.sents(stem=True)[0]
    ['ah', 'there', 'we', 'be', ',', '.']

    >>> bnc.tagged_sents()[0]
    [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
    >>> bnc.tagged_sents(c5=True)[0]
    [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]

A non-lazy (eager) loader.
--------------------------

    >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)

    >>> len(eager.words())
    151
    >>> eager.words(stem=True)[6:17]
    ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']

    >>> eager.tagged_words()[6:11]
    [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')]
    >>> eager.tagged_words(c5=True)[6:17]
    [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')]
    >>> len(eager.sents())
    15