File size: 1,714 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Natural Language Toolkit: Language Model Unit Tests
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Ilia Kurenkov <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
from functools import partial
from itertools import chain

from nltk.util import everygrams, pad_sequence

# Flatten one level of nesting: an iterable of sentences -> a stream of words.
flatten = chain.from_iterable

# Sentence-boundary padding settings shared by all helpers in this module:
# "<s>" marks the start of a sentence, "</s>" marks its end.
_PADDING_KWARGS = {
    "pad_left": True,
    "left_pad_symbol": "<s>",
    "pad_right": True,
    "right_pad_symbol": "</s>",
}
pad_both_ends = partial(pad_sequence, **_PADDING_KWARGS)
pad_both_ends.__doc__ = (
    "Pads both ends of a sentence to length specified by ngram order.\n"
    "\n"
    "Following convention <s> pads the start of sentence </s> pads its end.\n"
)


def padded_everygrams(order, sentence):
    """Helper with some useful defaults.

    Applies pad_both_ends to sentence and follows it up with everygrams.
    """
    # Materialize the padded sentence first: everygrams needs a sequence,
    # while pad_both_ends yields lazily.
    padded_sentence = list(pad_both_ends(sentence, n=order))
    return everygrams(padded_sentence, max_len=order)


def padded_everygram_pipeline(order, text):
    """Default preprocessing for a sequence of sentences.

    Creates two iterators:

    - sentences padded and turned into sequences of `nltk.util.everygrams`
    - sentences padded as above and chained together for a flat stream of words

    :param order: Largest ngram length produced by `everygrams`.
    :param text: Text to iterate over. Expected to be an iterable of sentences.
    :type text: Iterable[Iterable[str]]
    :return: iterator over text as ngrams, iterator over text as vocabulary data
    """
    padding_fn = partial(pad_both_ends, n=order)
    # Both results are lazy generators; note they iterate `text` independently.
    training_ngrams = (
        everygrams(list(padding_fn(sentence)), max_len=order) for sentence in text
    )
    flat_vocab_stream = flatten(padding_fn(sentence) for sentence in text)
    return training_ngrams, flat_vocab_stream