Spaces:
Running
Running
#!/usr/bin/env python | |
""" | |
Convert CLC-FCE dataset (The Cambridge Learner Corpus) to the parallel sentences format. | |
""" | |
import argparse
import glob
import os
import re
from xml.etree import ElementTree

try:
    # Removed in Python 3.9; kept so code that still uses the old name works.
    from xml.etree import cElementTree
except ImportError:
    cElementTree = ElementTree

from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
def annotate_fce_doc(xml):
    """Return the annotated text of one FCE XML document.

    Every ``<p>`` element matched by ``head/text/*/coded_answer/p`` (relative
    to the document root) is rendered with ``_get_formatted_text`` and the
    rendered paragraphs are joined with newlines.

    Args:
        xml: The XML document as a string.

    Returns:
        A single string containing the rendered paragraphs separated by
        ``"\\n"``; an empty string when no paragraph matches.
    """
    # ``xml.etree.cElementTree`` was removed in Python 3.9; ``ElementTree``
    # exposes the same ``fromstring`` API.
    doc = ElementTree.fromstring(xml)
    paragraphs = doc.findall('head/text/*/coded_answer/p')
    return '\n'.join(_get_formatted_text(p) for p in paragraphs)
def _get_formatted_text(elem, ignore_tags=None): | |
text = elem.text or '' | |
ignore_tags = [tag.upper() for tag in (ignore_tags or [])] | |
correct = None | |
mistake = None | |
for child in elem.getchildren(): | |
tag = child.tag.upper() | |
if tag == 'NS': | |
text += _get_formatted_text(child) | |
elif tag == 'UNKNOWN': | |
text += ' UNKNOWN ' | |
elif tag == 'C': | |
assert correct is None | |
correct = _get_formatted_text(child) | |
elif tag == 'I': | |
assert mistake is None | |
mistake = _get_formatted_text(child) | |
elif tag in ignore_tags: | |
pass | |
else: | |
raise ValueError(f"Unknown tag `{child.tag}`", text) | |
if correct or mistake: | |
correct = correct or '' | |
mistake = mistake or '' | |
if '=>' not in mistake: | |
text += f'{{{mistake}=>{correct}}}' | |
else: | |
text += mistake | |
text += elem.tail or '' | |
return text | |
def convert_fce(fce_dir):
    """Annotate every XML document found in an FCE dataset folder.

    Looks for ``<fce_dir>/dataset/*/*.xml``, processes the files in sorted
    path order and renders each one with ``annotate_fce_doc``.

    Args:
        fce_dir: Path to the root folder of the FCE dataset.

    Returns:
        A list with one annotated document (string) per XML file.

    Raises:
        UserWarning: If ``fce_dir`` is not a directory or has no ``dataset``
            entry inside it.
    """
    # Validate the dataset location before touching any file.
    if not os.path.isdir(fce_dir):
        raise UserWarning(f"{fce_dir} is not a valid path")
    dataset_dir = os.path.join(fce_dir, 'dataset')
    if not os.path.exists(dataset_dir):
        raise UserWarning(f"{fce_dir} doesn't point to a dataset's root dir")

    def _annotate_file(path):
        # Read one XML file and convert it to the annotated text format.
        with open(path, encoding='utf-8') as xml_file:
            return annotate_fce_doc(xml_file.read())

    xml_pattern = os.path.join(dataset_dir, '*/*.xml')
    return [_annotate_file(path) for path in sorted(glob.glob(xml_pattern))]
def main():
    """Write the FCE corpus as two parallel, tokenized sentence files.

    Reads the module-level ``args`` namespace (``fce_dataset_path`` and
    ``output``) and writes ``fce-original.txt`` (the text with the mistakes)
    and ``fce-applied.txt`` (the text with the corrections applied) into the
    output folder, creating the folder if needed.  Line N of both files comes
    from the same sentence, tokenized with ``word_tokenize`` and joined with
    single spaces.

    A sentence is skipped when either version is empty or still contains
    ``{``, ``}`` or ``=`` after the annotations were substituted (for
    example nested annotations).
    """
    # Compile the patterns once instead of once per sentence.
    annotation_re = re.compile(r'{([^{}]*?)=>([^{}]*?)}')
    # A piece ending in "{" or ">" plus sentence punctuation; presumably the
    # sentence splitter cut an annotation such as "{x=>.}" in half.
    split_annotation_re = re.compile(r"[{>][.?!]$")
    leftover_markup_re = re.compile(r"[{}=]")

    fce = convert_fce(args.fce_dataset_path)
    os.makedirs(args.output, exist_ok=True)
    original_path = os.path.join(args.output, "fce-original.txt")
    applied_path = os.path.join(args.output, "fce-applied.txt")
    with open(original_path, 'w', encoding='utf-8') as out_original, \
            open(applied_path, 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            for chunk in re.split(r"\n +\n", doc):
                pieces = sent_tokenize(chunk)
                pending = ""
                for idx, piece in enumerate(pieces):
                    sentence = f"{pending} {piece}" if pending else piece
                    pending = ""
                    # Merge a cut annotation with the following piece.  The
                    # last piece has nothing to merge with, so it is
                    # processed as is instead of indexing past the end.
                    has_next = idx + 1 < len(pieces)
                    if has_next and split_annotation_re.search(sentence):
                        pending = sentence
                        continue
                    original = annotation_re.sub(r"\1", sentence)
                    applied = annotation_re.sub(r"\2", sentence)
                    if not original or not applied:
                        continue
                    # Filter out nested annotations and other leftovers.
                    if (leftover_markup_re.search(original)
                            or leftover_markup_re.search(applied)):
                        continue
                    out_original.write(
                        " ".join(word_tokenize(original)) + "\n")
                    out_applied.write(
                        " ".join(word_tokenize(applied)) + "\n")
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and run the conversion.
    parser = argparse.ArgumentParser(description=(
        "Convert CLC-FCE dataset to the parallel sentences format."))
    parser.add_argument('fce_dataset_path',
                        help='Path to the folder with the FCE dataset')
    # main() builds the output file names from this value, so a missing
    # option must be reported by argparse instead of failing later on None.
    parser.add_argument('--output', required=True,
                        help='Path to the output folder')
    # main() reads this module-level namespace.
    args = parser.parse_args()
    main()