import glob
import math
import os
import re
import string

import torch
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
def sanitize(line):
    """Return *line* reduced to the words used for the "is this body text" checks.

    Removed, in this order: everything from the first ``[`` to the last ``]``
    on the line, English month names, words written entirely in capital
    letters, digits, a fixed set of typographic quote glyphs, and ASCII
    punctuation.  Runs of whitespace are then collapsed to single spaces and
    the result is stripped.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    # One table removes both the typographic glyphs and ASCII punctuation.
    drop_chars = str.maketrans('', '', "‟“’❝❞‚‘‛❛❜❟â" + string.punctuation)

    cleaned = re.sub(r'\[.+\]', '', line)
    # Months are removed one after another (not in a single pass) so that a
    # removal can expose a later month name, exactly as a chain of replaces.
    for month in month_names:
        cleaned = cleaned.replace(month, '')
    cleaned = re.sub(r'\b[A-Z]+\b', '', cleaned.strip())
    cleaned = re.sub(r'\d', '', cleaned)
    cleaned = cleaned.translate(drop_chars)
    return re.sub(r'\s+', ' ', cleaned).strip()
def remove_footnotes_and_clean(sents):
    """Join the raw lines in *sents* into one whitespace-normalised string.

    Each line first has ``'``, ``*``, the sequence ``’®`` and ``’`` removed
    and is stripped.  Lines are then concatenated with a single space after
    each one, except that a line ending in ``-`` is glued directly to the
    next line (the hyphen itself is kept).  Finally every run of whitespace
    is collapsed to one space.

    Lines that are empty after stripping contribute nothing.  Previously such
    a line raised ``IndexError``, which a bare ``except`` turned into a debug
    dump followed by a blocking ``input()`` call.

    Args:
        sents: iterable of strings (typically raw lines including newlines).

    Returns:
        The joined text.  When at least one line contributed, the result
        ends with a single trailing space unless the final line ended in
        ``-``; an empty input yields ``''``.
    """
    pieces = []
    for raw in sents:
        # Order matters: "’®" must be removed before the lone "’".
        text = raw.replace("'", '').replace('*', '').replace('’®', '').replace('’', '').strip()
        if not text:
            continue
        pieces.append(text)
        if not text.endswith('-'):
            pieces.append(' ')
    return re.sub(r'\s+', ' ', ''.join(pieces))
# ---------------------------------------------------------------------------
# Stage 1: for every raw file in text_files/, keep the lines judged to be body
# text, group them into paragraphs, and write one paragraph per line to a
# same-named file in clean_text_files/.
# ---------------------------------------------------------------------------

# A line shaped "<non-word char><whitespace><word char>" switches the
# `footnote_found` flag on; a page-header line switches it off again.
_FOOTNOTE_START = re.compile(r'^\W\s\w')
_PAGE_HEADER = re.compile(r'^VOL.*\d\d\d\d.*\d$')
_RUNNING_TITLE = 'THE COLLECTED WORKS OF MAHATMA GANDHI'


def _edge_whitespace(line):
    """Return how many whitespace characters surround *line*.

    This counts leading AND trailing whitespace (so the newline is included);
    the script uses it as its indentation measure.
    """
    return len(line) - len(line.strip())


def _starts_footnote(line):
    """Return True when the stripped line matches the footnote-start shape."""
    return _FOOTNOTE_START.search(line.strip()) is not None


def _is_page_header(line):
    """Return True for a 'VOL ... <year> ... <digit>' line or the running title."""
    stripped = line.strip()
    return _PAGE_HEADER.search(stripped) is not None or _RUNNING_TITLE in stripped


def _has_body_text(line):
    """Return True when the sanitized line has over 5 characters and over 4 words."""
    cleaned = sanitize(line)
    return len(cleaned) > 5 and len(cleaned.split()) > 4


path = 'text_files/'
ml = sorted(glob.glob(path + '*.txt'))
show = False  # True: print every per-line decision and pause after each line
path = 'clean_text_files/'
# Create the output directory up front so the first write cannot fail on it.
os.makedirs(path, exist_ok=True)

for m in tqdm(ml, total=len(ml), ncols=100):
    with open(m, 'r') as file:
        content = file.readlines()
    if show:
        print(m)

    # Pass 1: average the whitespace measure over body-text lines that are
    # outside footnote regions.
    margins = []
    footnote_found = False
    for line in content:
        if _starts_footnote(line):
            footnote_found = True
        if _is_page_header(line):
            footnote_found = False
        if not footnote_found and _has_body_text(line):
            margin = _edge_whitespace(line)
            if show:
                print(line.rstrip(), end='')
                print(margin)
            margins.append(margin)

    if margins:
        mean_spaces = math.floor(sum(margins) / len(margins))
    else:
        # Previously this was a ZeroDivisionError that aborted the whole run.
        tqdm.write('no body-text lines found in ' + m + '; using a mean of 0')
        mean_spaces = 0
    if show:
        print('ms', mean_spaces)
        print(' ' * mean_spaces + '^')

    # Pass 2: walk consecutive (line, next line) pairs and build paragraphs.
    # NOTE(review): the final line of the file is only ever seen as
    # `next_line`, and whatever is left in `sents` when the loop ends is
    # never appended to `paras`, so the last paragraph of each file is not
    # written. Kept as in the original -- confirm whether that is intended.
    paras = []
    sents = []
    footnote_found = False
    last_spaces = -1
    for line, next_line in zip(content, content[1:]):
        li_spaces = _edge_whitespace(line)
        if _starts_footnote(line):
            footnote_found = True
        if _is_page_header(line):
            # Headers end a footnote region and are themselves discarded.
            footnote_found = False
            continue

        trace = ('--', line.rstrip())
        if footnote_found:
            last_spaces = -1
        elif li_spaces <= mean_spaces:
            if _has_body_text(line):
                sents.append(line)
                last_spaces = li_spaces
                trace = ('++', line.rstrip())
            elif last_spaces == li_spaces:
                # A short line is kept only when its whitespace measure equals
                # that of the previously kept body-text line.
                sents.append(line)
                trace = ('++', line.rstrip())
            else:
                last_spaces = -1
        elif (_edge_whitespace(next_line) <= mean_spaces
              and _has_body_text(line) and _has_body_text(next_line)):
            # A wider-than-average line followed by a normal one, both with
            # body text, starts a new paragraph: flush the one collected so far.
            paras.append(remove_footnotes_and_clean(sents))
            sents = [line]
            trace = ('++', line.rstrip(), '<------NEW PARA')
        else:
            last_spaces = -1

        if show:
            print(*trace)
            input('wait')

    # paras[0] holds whatever was collected before the first detected
    # paragraph start and is dropped, as in the original.
    with open(path + os.path.basename(m), 'w') as file:
        file.write('\n'.join(paras[1:]))
# ---------------------------------------------------------------------------
# Stage 2: concatenate clean_text_files/1.txt .. 98.txt, then write
#   all_paras.txt        - every paragraph, one per line
#   all_sents.txt        - every sentence found by sent_tokenize
#   all_tc_sents_200.txt - paragraphs split until each piece fits the token
#                          budget (or is a single sentence)
# ---------------------------------------------------------------------------


def _token_count(passage):
    """Return the number of input ids for *passage* wrapped in the start/end markers."""
    return len(tokenizer('<|startoftext|>' + passage + '<|endoftext|>')['input_ids'])


def _split_to_token_budget(para, max_tokens=200):
    """Split *para* into pieces of at most *max_tokens* tokens, in reading order.

    A piece over the budget is halved at its middle sentence boundary and
    both halves are re-checked.  A piece that sent_tokenize cannot split
    further (one sentence or none) is returned as-is even if it is still
    over the budget.
    """
    pieces = []
    pending = [para]  # stack; the next piece to examine is at the end
    while pending:
        passage = pending.pop()
        if _token_count(passage) > max_tokens:
            sentences = sent_tokenize(passage)
            if len(sentences) > 1:
                half = len(sentences) // 2
                # Push the second half first so the first half is handled next.
                pending.append(' '.join(sentences[half:]))
                pending.append(' '.join(sentences[:half]))
                continue
        pieces.append(passage)
    return pieces


path = 'clean_text_files/'
ml = sorted(glob.glob(path + '*.txt'))  # not used below: files are read by number
text = []
# The files are read by number rather than from `ml`, so they are taken in
# numeric order 1..98.
for m in tqdm(range(1, 99)):
    with open(path + str(m) + '.txt', 'r') as file:
        text += file.readlines()

# The last line read from a file has no newline when the file does not end
# with one; add it so that paragraph is not fused with the first paragraph
# of the following file.
with open('all_paras.txt', 'w') as file:
    file.writelines(p if p.endswith('\n') else p + '\n' for p in text)

sents = []
tcsents = []
for para in tqdm(text):
    para = para.strip()
    sents += sent_tokenize(para)
    tcsents += _split_to_token_budget(para, max_tokens=200)

with open('all_sents.txt', 'w') as file:
    file.write('\n'.join(sents))
with open('all_tc_sents_200.txt', 'w') as file:
    file.write('\n'.join(tcsents))