# File size: 2,253 Bytes (revision a106f67)
import os
import argparse
import json
from tqdm import tqdm
def load_url_text_map(knowledge_store_dir, claim_id):
    """Build a mapping from URL to scraped text for a single claim.

    Reads ``<knowledge_store_dir>/<claim_id>.json``, parsing it as one JSON
    object per line. Each object must provide a ``"url"`` key and a
    ``"url2text"`` key holding an iterable of strings; those strings are
    joined with single spaces to form the text stored for that URL.

    Args:
        knowledge_store_dir: Directory containing the per-claim files.
        claim_id: Identifier used to build the file name ``{claim_id}.json``.

    Returns:
        A dict mapping each URL to its concatenated text. The dict is empty
        when the file does not exist. If a URL occurs on several lines, the
        last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"url"`` or ``"url2text"`` key.
    """
    url_text_map = {}
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")
    if not os.path.exists(knowledge_file):
        return url_text_map

    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # so non-ASCII scraped text is read correctly everywhere.
    with open(knowledge_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # rather than failing in json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            url_text_map[data["url"]] = " ".join(data["url2text"])
    return url_text_map
if __name__ == "__main__":
    # Command-line entry point: load the veracity predictions, attach a
    # "scraped_text" field to every evidence item whose URL has text in the
    # claim's knowledge-store file, and write the result to the output file.
    parser = argparse.ArgumentParser(
        description="Add scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="Json file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="Json file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        # There is no usable default: without this option the directory would
        # be None and os.path.join would fail with a TypeError on the first
        # claim, so let argparse report the missing option up front.
        required=True,
        help="Directory of json files of the knowledge store containing url2text.",
    )
    args = parser.parse_args()

    # Read as UTF-8 to match the encoding used when writing the output below.
    with open(args.veracity_prediction_file, encoding="utf-8") as f:
        predictions = json.load(f)

    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Process each evidence in the claim and add scraped_text
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            # An empty string is treated the same as a missing entry.
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                print(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")
|