File size: 5,602 Bytes
63858e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""Extractor functions to retrieve sentences by character chunks from a file

This script contains the logic that allows the user to process and filter
sentences of the original corpus. By default, this considers a minimum sentence
length, and removes newlines and multiple consecutive spaces.

Configuration for existing functionality is at the top of the file. Feel free to
add new processing and/or filter functions. The "process_line" and "filter_line"
functions contain the pipeline for processing the scripts as needed.

"""
import regex as re
import argparse
from pathlib import Path
from functools import partial
from typing import Union

MIN_LINE_LENGTH = 8 # words

def parse_args():
    """Parse command-line arguments for the extractor script.

    Returns:
        argparse.Namespace with `file` and `outdir` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file",
        help="Path to .txt file to analyze and annotate",
    )
    parser.add_argument(
        "-o", "--outdir",
        help="Path of directory in which to store the analyzed sentences as a .pckl",
    )
    return parser.parse_args()

# ============================================================
#                  Helper functions
# ============================================================
# String -> String
def replace_newlines(s: str) -> str:
    """Replace each run of one or more newline characters in `s` with a single space."""
    newline_run = re.compile(r"\n+")
    return newline_run.sub(" ", s)

# String -> String
def replace_multispace(s: str) -> str:
    """Collapse every run of whitespace characters in `s` into a single space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", s)

def is_short_sentence(s: str, min_len: int = 8) -> bool:
    """Return True if `s` has fewer than `min_len` space-separated words.

    Fix: the return annotation was `-> str`, but the function returns a bool.

    Args:
        s: Sentence to measure.
        min_len: Minimum word count for a sentence to be considered long enough.

    Returns:
        True when the word count of `s` is below `min_len`.
    """
    # Splitting on a single space counts empty tokens from consecutive
    # spaces; input is expected to have been normalized by process_line.
    return len(s.split(' ')) < min_len

def contains_char(char: str, s: str) -> bool:
    """Return True if `char` occurs anywhere in `s`.

    Fix: the return annotation was `-> str`, but the function returns a bool.

    Args:
        char: Substring (typically a single character) to look for.
        s: String to search.

    Returns:
        True when `char` is found in `s`.
    """
    return char in s

# ============================================================
#                  Compilation functions
# ============================================================

def process_line(line: str) -> str:
    """Normalize a chunk of text onto a single line.

    Runs of newlines become single spaces, then any remaining runs of
    whitespace are squeezed down to one space each.

    Args:
        line: Chunk of text, possibly spanning multiple lines.

    Returns:
        Input stripped of newlines and multiple consecutive spaces.
    """
    # Same two-pass normalization as the replace_newlines/replace_multispace
    # helpers, inlined here.
    without_newlines = re.sub(r"\n+", " ", line)
    return re.sub(r"\s+", " ", without_newlines)

def filter_line(line: str) -> bool:
    """Return True when `line` is at least MIN_LINE_LENGTH words long.

    Redefine this function with desired helper functions, returning True
    for any line you want to keep.
    """
    return not is_short_sentence(line, MIN_LINE_LENGTH)

# ============================================================
#                      Main Logic
# ============================================================

def read_outcomes(chars: str) -> Union[str, None]:
    """Process a chunk of characters and decide its fate.

    Args:
        chars: Chunk of text to process.

    Returns:
        The processed chunk when it passes the filter, otherwise None.

    Raises:
        StopIteration: If `chars` is the empty string "" (signals end of
            file to the read_on driver loop, which catches it).
    """
    if chars == '':
        raise StopIteration
    processed = process_line(chars)
    return processed if filter_line(processed) else None

def get_chars(n: int, f) -> Union[str, None]:
    """Extract `n` chars from opened file `f` and process them.

    Args:
        n: Number of characters to read from the opened file.
        f: Opened file, as returned by `open(fname)`.

    Returns:
        The processed chunk of text, or None if it does not pass the filter.

    Raises:
        Propagates StopIteration from read_outcomes when the file is
        exhausted; raises nothing of its own.
    """
    return read_outcomes(f.read(n))

def get_line(f):
    """Read the next line from open file `f` and process it.

    Three outcomes, mirroring read_outcomes:

    1. StopIteration propagates when the file has reached the end.
    2. A processed line is returned when it passes the filter.
    3. None is returned when the line fails the filter.
    """
    return read_outcomes(f.readline())

def read_on(reader, f):
    """Yield values produced by `reader` from `f` until the source is exhausted.

    Args:
        reader: A unary function of signature (f: _io.TextIOWrapper) -> str;
            returns None for filtered-out items and raises StopIteration at
            end of input.
        f: An opened file (or any source `reader` accepts).

    Yields:
        Each non-None value from `reader`, in order, until StopIteration.
    """
    while True:
        try:
            item = reader(f)
        except StopIteration:
            return
        if item is not None:
            yield item


def extract_chars(infile, n=10000):
    """Yield processed `n`-character chunks from the file at `infile`.

    Args:
        infile: Path of the text file to read.
        n: Number of characters per chunk.

    Yields:
        Each processed chunk that passes the filter.
    """
    reader = partial(get_chars, n)
    # Fix: the original `return read_on(...)` made the following
    # src.close() unreachable, leaking the file handle. As a generator
    # with a with-block, the file is closed when iteration finishes or
    # the generator is discarded.
    with open(infile, 'r') as src:
        yield from read_on(reader, src)


def extract_lines(infile):
    """Given a file path, yield the processed lines from that file.

    Args:
        infile: Path of the text file to read.

    Yields:
        Each processed line that passes the filter.
    """
    # Fix: the original `return read_on(...)` made the following
    # src.close() unreachable, leaking the file handle. As a generator
    # with a with-block, the file is closed when iteration finishes or
    # the generator is discarded.
    with open(infile, 'r') as src:
        yield from read_on(get_line, src)


def extract_sentences_to_file(infile, outfname: str):
    """Extract sentences from `infile` into a new file named `outfname`.

    Args:
        infile: Path of the source text file.
        outfname: Path of the output file; opened with mode 'x', so it
            must not already exist.

    Returns:
        `outfname`, so callers can locate the written file (the original
        returned None even though main() assigned its return value).

    Raises:
        FileExistsError: If `outfname` already exists (mode 'x').
    """
    # Fix: with-block guarantees the output handle is closed even if a
    # write raises; the original close() was skipped on error.
    with open(outfname, 'x') as out:
        for line in extract_lines(infile):
            out.write(line + "\n")
    return outfname

def main(infile, outdir):
    """Create `outdir` and save the processed sentences of `infile` there.

    Args:
        infile: Path to the source .txt file.
        outdir: Directory in which to store the extracted sentences.

    Returns:
        Path of the written output file.
    """
    outfname = Path(infile).stem + '.txt'
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / outfname
    # Fix: extract_sentences_to_file returned None, so the original
    # `out_path = ...` always returned None. Return the known output
    # path directly instead of relying on that return value.
    extract_sentences_to_file(infile, outfile)
    return outfile

if __name__ == "__main__":
    # Script entry point: parse CLI args (-f/--file, -o/--outdir) and run
    # the extraction pipeline.
    args = parse_args()
    main(args.file, args.outdir)