Spaces:
Runtime error
Runtime error
File size: 5,602 Bytes
63858e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
"""Extractor functions to retrieve sentences by character chunks from a file
This script contains the logic that allows the user to process and filter
sentences of the original corpus. By default, this considers a minimum sentence
length, and removes newlines and multiple consecutive spaces.
Configuration for existing functionality is at the top of the file. Feel free to
add new processing and/or filter functions. The "process_line" and "filter_line"
functions contain the pipeline for processing the scripts as needed.
"""
import regex as re
import argparse
from pathlib import Path
from functools import partial
from typing import Union
MIN_LINE_LENGTH = 8 # words
def parse_args():
    """Parse the command-line arguments for the extractor script.

    Returns:
        argparse.Namespace with `file` (input .txt path) and `outdir`
        (output directory) attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", help="Path to .txt file to analyze and annotate")
    parser.add_argument("-o", "--outdir", help="Path of directory in which to store the analyzed sentences as a .pckl")
    return parser.parse_args()
# ============================================================
# Helper functions
# ============================================================
# String -> String
def replace_newlines(s: str) -> str:
    """Collapse each run of one or more newline characters in `s` into a single space."""
    return re.sub(r"\n+", r" ", s)
# String -> String
def replace_multispace(s: str) -> str:
    """Collapse each run of whitespace in `s` into a single space."""
    return re.sub(r"\s+", r" ", s)
def is_short_sentence(s: str, min_len: int = 8) -> bool:
    """Return True if `s` has fewer than `min_len` space-separated words.

    The original annotation declared `-> str`, but the comparison always
    produces a bool; the annotation is corrected here.

    Args:
        s: Sentence to measure.
        min_len: Minimum word count for a sentence to be considered long enough.

    Returns:
        True when the sentence contains fewer than `min_len` words.
    """
    return len(s.split(' ')) < min_len
def contains_char(char: str, s: str) -> bool:
    """Return True if `char` occurs anywhere in `s`.

    The original annotation declared `-> str`, but the `in` operator
    produces a bool; the annotation is corrected here.
    """
    return char in s
# ============================================================
# Compilation functions
# ============================================================
def process_line(line: str) -> str:
    """Normalize the whitespace of a chunk of text.

    Newlines are converted to spaces, and consecutive whitespace is then
    collapsed to a single space.

    Args:
        line: Chunk of text.

    Returns:
        The input stripped of newlines and multiple consecutive spaces.
    """
    without_newlines = replace_newlines(line)
    return replace_multispace(without_newlines)
def filter_line(line: str) -> bool:
    """Return True when `line` satisfies the MIN_LINE_LENGTH configuration.

    Redefine this function with the desired helper functions, returning
    True for lines that should be kept.
    """
    return not is_short_sentence(line, MIN_LINE_LENGTH)
# ============================================================
# Main Logic
# ============================================================
def read_outcomes(chars: str) -> Union[str, None]:
    """Process a chunk of characters and decide whether to keep it.

    Args:
        chars: Chunk of text to process.

    Returns:
        The processed chunk, or None when it fails the filter.

    Raises:
        StopIteration: When `chars` is the empty string "" (end of file);
            `read_on` relies on this to terminate its read loop.
    """
    if chars == '':
        raise StopIteration
    processed = process_line(chars)
    return processed if filter_line(processed) else None
def get_chars(n: int, f) -> Union[str, None]:
    """Extract `n` characters from opened file `f` and process them.

    Args:
        n: Number of characters to read from the opened file.
        f: Opened file from the return of `open(fname)`.

    Returns:
        The processed chunk of text, or None if it does not pass the filter.

    Raises:
        This function raises no errors of its own, but lets the
        StopIteration from read_outcomes propagate at end of file.
    """
    return read_outcomes(f.read(n))
def get_line(f):
    """Read the next line of open file `f` and process it.

    Three outcomes are possible:
    1. StopIteration propagates when the opened file has reached the end.
    2. A processed line is returned if it passes the filter.
    3. None is returned when the line does not pass the filter.
    """
    return read_outcomes(f.readline())
def read_on(reader, f):
    """Read from an open file `f` according to the function `reader`.

    Args:
        reader: A unary function of signature (f: _io.TextIOWrapper) -> str.
        f: An opened file, as returned by `open(fname)`.

    Yields:
        Lines produced by `reader`, skipping None results, until `reader`
        signals end of input by raising StopIteration.
    """
    reading = True
    while reading:
        try:
            result = reader(f)
        except StopIteration:
            reading = False
        else:
            if result is not None:
                yield result
def extract_chars(infile, n: int = 10000):
    """Yield processed `n`-character chunks from the file at `infile`.

    Implemented as a generator with a `with` block so the file handle is
    closed when the generator is exhausted or closed. The original version
    placed `src.close()` after an unconditional `return`, making it
    unreachable and leaking the handle.

    Args:
        infile: Path of the text file to read.
        n: Number of characters per chunk.

    Yields:
        Each processed chunk that passes the filter.
    """
    reader = partial(get_chars, n)
    with open(infile, 'r') as src:
        yield from read_on(reader, src)
def extract_lines(infile):
    """Given a file path, yield the processed lines from that file.

    Implemented as a generator with a `with` block so the file handle is
    closed when the generator is exhausted or closed. The original version
    placed `src.close()` after an unconditional `return`, making it
    unreachable and leaking the handle.

    Args:
        infile: Path of the text file to read.

    Yields:
        Each processed line that passes the filter.
    """
    with open(infile, 'r') as src:
        yield from read_on(get_line, src)
def extract_sentences_to_file(infile, outfname: str) -> str:
    """Extract sentences from a file into a new file indicated by `outfname`.

    Args:
        infile: Path of the source text file.
        outfname: Path of the output file; opened in exclusive-create
            ('x') mode, so it must not already exist.

    Returns:
        `outfname`, the path of the written file. (The original returned
        None implicitly; returning the path lets callers report it.)

    Raises:
        FileExistsError: If `outfname` already exists ('x' mode).
    """
    linegen = extract_lines(infile)
    # `with` guarantees the output handle is closed even if a write fails.
    with open(outfname, 'x') as out:
        for line in linegen:
            out.write(line + "\n")
    return outfname
def main(infile, outdir):
    """Create `outdir` and save the processed sentences into it.

    Args:
        infile: Path to the source .txt file.
        outdir: Directory in which to store the processed sentences.

    Returns:
        The Path of the written output file. (The original returned the
        result of extract_sentences_to_file, which was always None; the
        locally computed `outfile` path is returned instead.)
    """
    outfname = Path(infile).stem + '.txt'
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / outfname
    extract_sentences_to_file(infile, outfile)
    return outfile
# Script entry point: parse CLI arguments and run the extraction pipeline.
if __name__ == "__main__":
    args = parse_args()
    main(args.file, args.outdir)
|