import argparse
import json
import os

from tqdm import tqdm


def load_url_text_map(knowledge_store_dir, claim_id):
    """Build a url -> scraped-text mapping for one claim from the knowledge store."""
    url_text_map = {}
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")

    if os.path.exists(knowledge_file):
        # Knowledge-store files are JSON Lines: one JSON object per line.
        with open(knowledge_file, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                url = data["url"]
                # "url2text" holds the scraped page as a list of text chunks;
                # join them into a single string.
                url_text_map[url] = " ".join(data["url2text"])

    return url_text_map

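# Illustrative knowledge-store record (field names taken from the loader above;
# the values themselves are hypothetical):
#   {"url": "https://example.com/article", "url2text": ["First chunk.", "Second chunk."]}
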
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Add scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="Json file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="Json file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        help="Directory of json files of the knowledge store containing url2text.",
    )
    args = parser.parse_args()

    # Load the veracity predictions (a single JSON array of claims).
    with open(args.veracity_prediction_file, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Attach the scraped text to each evidence entry, matching on url.
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                print(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    # Write the augmented predictions back out as pretty-printed JSON.
    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")
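
# Example invocation (a sketch: the script filename is an assumption and the
# knowledge-store directory is dataset-specific; -i/-o fall back to the
# defaults declared above):
#   python add_scraped_text.py \
#       --knowledge_store_dir data_store/knowledge_store/dev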