import json
import os
import random
import shutil
from datetime import datetime

import langdetect
import nltk
import pandas as pd
from langdetect import DetectorFactory, LangDetectException

DATA_FILE = "data/crashes/thecrashes_data_all_text.json"

DEV_PORTION = 0.10

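# Seed both RNGs so the dev/main split and langdetect's output are reproducible.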
random.seed(2001)
DetectorFactory.seed = 0


def is_a_real_time(timestamp):
    """Helper function: check whether a timestamp carries a real time.

    Times of exactly midnight are treated as date-only placeholders.
    """
    if timestamp.hour == timestamp.minute == timestamp.second == 0:
        return False
    return True


def main():
    process_events()


def detect_language(article):
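    # Use the longest available field as the detection sample:
    # body text first, then summary, then title.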
    if article["alltext"]:
        sample = article["alltext"]
    elif article["summary"]:
        sample = article["summary"]
    else:
        sample = article["title"]

    try:
        return langdetect.detect(sample)
    except LangDetectException:
        print(f"\tCould not detect language for text_id={article['id']}")
        print(f"\tSample={sample}")
        print()
        return "UNK_LANG"


def extract_text_info(event):
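    """Collect sentence-segmented text lines, matching ID lines, and one
    metadata row per article for all articles attached to an event."""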
    ev_text_lines = []
    ev_id_lines = []
    ev_meta_rows = []

    for article in event["articles"]:
        text_id = article["id"]
        try:
            pubdate = datetime.fromisoformat(article["publishedtime"]).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            print(f"\t\tcould not parse date {article['publishedtime']}")
            pubdate = None
        url = article["url"]
        provider = article["sitename"]
        title = article["title"]
        language = detect_language(article)
        ev_meta_rows.append({
            "event_id": event["id"],
            "text_id": text_id,
            "pubdate": pubdate,
            "language": language,
            "url": url,
            "provider": provider,
            "title": title
        })

        summary = article["summary"]
        body = article["alltext"]

        text_lines = []
        id_lines = []

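        # Pair every text line with an ID line recording where it came from
        # (event, article, and article part: title, summary, or body).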
        for line in segment(title, language):
            text_lines.append(line)
            id_lines.append(f"event {event['id']}\ttext {text_id}\ttitle")

        for line in segment(summary, language):
            text_lines.append(line)
            id_lines.append(f"event {event['id']}\ttext {text_id}\tsummary")

        for line in segment(body, language):
            text_lines.append(line)
            id_lines.append(f"event {event['id']}\ttext {text_id}\tbody")

        ev_text_lines.append(text_lines)
        ev_id_lines.append(id_lines)

    return ev_text_lines, ev_id_lines, ev_meta_rows


def segment(text, language):
    # NLTK's Punkt tokenizer has no model for Hebrew or Vietnamese; return the
    # text as a single segment so callers don't iterate it character by character.
    if language in ["he", "vi"]:
        return [text]

    lang_map = {
        "nl": "dutch",
        "en": "english",
        "es": "spanish",
        "de": "german",
        "fr": "french",
        "ru": "russian",
        "pt": "portuguese"
    }

    nltk_lang = lang_map.get(language)

    if not nltk_lang:
        if language == "af":
            # No Punkt model for Afrikaans; Dutch is the closest supported language.
            nltk_lang = "dutch"
        else:
            print(f"Found an article with unsupported language={language}, falling back to English NLTK")
            nltk_lang = "english"

    return nltk.sent_tokenize(text, nltk_lang)


def write_to_text_by_event(text_lines, text_meta_rows, event_id, split_to_dir, split):
    event_dir = f"{split_to_dir[split]}/{event_id}"
    os.makedirs(event_dir, exist_ok=True)
    for art_lines, row in zip(text_lines, text_meta_rows):
        text_file = f"{event_dir}/{row['text_id']}.txt"
        with open(text_file, "w", encoding="utf-8") as f:
            for line in art_lines:
                # Write plain "\n": text mode translates line endings itself,
                # so os.linesep would yield "\r\r\n" on Windows.
                f.write(line + "\n")


def process_events():
    print("Loading data file...")
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    event_all_rows = []
    event_dev_rows = []
    event_main_rows = []

    text_all_rows = []
    text_dev_rows = []
    text_main_rows = []

    text_file_basenames = {
        "all": "output/crashes/split_data/all.texts",
        "dev": "output/crashes/split_data/split_dev10.texts",
        "main": "output/crashes/split_data/split_main.texts"
    }

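    # Truncate all six flat output files so the per-event appends start clean.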
    for split, bn in text_file_basenames.items():
        for ext in [".text.txt", ".ids.txt"]:
            open(f"{bn}{ext}", "w", encoding="utf-8").close()

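    # Recreate the per-event text directories from scratch for each split.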
    text_files_by_event_dir = {}
    for split in ["all", "dev", "main"]:
        prefix = "split_dev10" if split == "dev" else "split_main" if split == "main" else "all"
        text_dir = f"output/crashes/split_data/{prefix}_texts_by_event"
        text_files_by_event_dir[split] = text_dir
        if os.path.exists(text_dir):
            shutil.rmtree(text_dir)
        os.mkdir(text_dir)

    def append_to_txt(txt_file, lines):
        with open(txt_file, "a", encoding="utf-8") as f_out:
            for art_lines in lines:
                for line in art_lines:
                    f_out.write(line + "\n")

print("Processing events...") |
|
for event in data: |
|
event_id = event["id"] |
|
print(f"\tevent_id={event_id}") |
|
try: |
|
timestamp = datetime.fromisoformat(event["date"]) |
|
except ValueError: |
|
timestamp = None |
|
|
|
        event_row = {
            "event:id": event_id,
            "event:date": timestamp.strftime("%Y-%m-%d") if timestamp else None,
            "event:time": timestamp.strftime("%H:%M:%S") if timestamp and is_a_real_time(timestamp) else None,
            # Format as "lat, lon" rather than the tuple repr "(lat, lon)".
            "event:coordinates": f"{event['latitude']}, {event['longitude']}",
            # Transportation modes 5-13 count as vehicles here.
            "vehicle_involved": 1 if any(p["transportationmode"] in range(5, 14) for p in event["persons"]) else 0
        }
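
        # Outcome counts for dead (health code 3) and injured (health code 2),
        # broken down by children and by transportation mode.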
        for health, health_code in (("dead", 3), ("injured", 2)):
            all_with_health = [p for p in event["persons"] if p["health"] == health_code]
            event_row[f"outcomes:{health}:total"] = len(all_with_health)
            event_row[f"outcomes:{health}:child"] = len([p for p in all_with_health if p["child"] == 1])
            for mode, mode_codes in (("pedestrian", [1]), ("cyclist", [2]), ("vehicle", range(5, 14))):
                event_row[f"outcomes:{health}:{mode}"] = len([p for p in all_with_health
                                                              if p["transportationmode"] in mode_codes])

        text_lines, text_id_lines, text_meta_rows = extract_text_info(event)

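        # Every event is written to the "all" split; dev/main membership follows below.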
        event_all_rows.append(event_row)
        text_all_rows.extend(text_meta_rows)
        append_to_txt(text_file_basenames["all"] + ".text.txt", text_lines)
        append_to_txt(text_file_basenames["all"] + ".ids.txt", text_id_lines)
        write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, "all")

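        # Roughly DEV_PORTION (10%) of events go to the dev split, the rest to main.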
        if random.random() < DEV_PORTION:
            event_dev_rows.append(event_row)
            text_dev_rows.extend(text_meta_rows)
            append_to_txt(text_file_basenames["dev"] + ".text.txt", text_lines)
            append_to_txt(text_file_basenames["dev"] + ".ids.txt", text_id_lines)
            write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, "dev")
        else:
            event_main_rows.append(event_row)
            text_main_rows.extend(text_meta_rows)
            append_to_txt(text_file_basenames["main"] + ".text.txt", text_lines)
            append_to_txt(text_file_basenames["main"] + ".ids.txt", text_id_lines)
            write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, "main")

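    # Write the event-level and text-level metadata tables for each split.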
    all_ev_df = pd.DataFrame(event_all_rows)
    main_ev_df = pd.DataFrame(event_main_rows)
    dev_ev_df = pd.DataFrame(event_dev_rows)
    for df, file in ((all_ev_df, "all.events"), (main_ev_df, "split_main.events"), (dev_ev_df, "split_dev10.events")):
        df.to_csv(f"output/crashes/split_data/{file}.csv")

    all_txt_df = pd.DataFrame(text_all_rows)
    main_txt_df = pd.DataFrame(text_main_rows)
    dev_txt_df = pd.DataFrame(text_dev_rows)
    for df, file in ((all_txt_df, "all.texts"), (main_txt_df, "split_main.texts"), (dev_txt_df, "split_dev10.texts")):
        df.to_csv(f"output/crashes/split_data/{file}.meta.csv")


if __name__ == '__main__':
    main()