# src/openfactcheck/solvers/webservice/factcheckgpt_utils/data_util.py

import csv
import json
import numpy as np
from collections import Counter
from typing import Dict, List, Any


def save_to_file(text, filename='error_output.txt'):
    """Append a string to the given file as a new line."""
    with open(filename, 'a', encoding='utf-8') as file:
        file.write(text + '\n')


def majority_vote(input_list):
    """Return the most frequent element in the list."""
    # Count occurrences of each element
    counter = Counter(input_list)
    # The element with the maximum count is the majority element
    majority_element = max(counter, key=counter.get)
    return majority_element
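
# Example usage (illustrative): pick the most frequent verdict in a list.
#   majority_vote(["SUPPORTS", "REFUTES", "SUPPORTS"])  -> "SUPPORTS"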


def is_float(string):
    """Return True if the string can be parsed as a float."""
    try:
        float(string)
        return True
    except ValueError:
        return False
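
# Example usage (illustrative): the float() check accepts signed and
# exponent forms and rejects malformed numbers.
#   is_float("-1.5")  -> True
#   is_float("1.2.3") -> False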


def save_json(dictionary: Dict[str, Any], save_dir: str) -> None:
    """Serialize a dictionary and write it to the given JSON file path."""
    json_object = json.dumps(dictionary, indent=4, ensure_ascii=False)
    with open(save_dir, "w", encoding='utf-8') as outfile:
        outfile.write(json_object)


def read_json(filepath: str) -> Dict[str, Any]:
    """Load a JSON file into a dictionary."""
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data


def list_to_dict(data: List[Dict[str, Any]]) -> Dict[int, Any]:
    """Convert a list into a dictionary keyed by each item's index."""
    return {i: d for i, d in enumerate(data)}
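
# Example usage (illustrative): index-keyed view of a result list.
#   list_to_dict([{"claim": "A"}, {"claim": "B"}])  -> {0: {"claim": "A"}, 1: {"claim": "B"}}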


def load_jsonl(path):
    """Read a list of objects from a JSON lines file, one JSON object per line."""
    data = []
    with open(path, 'r', encoding='utf-8') as reader:
        for line in reader:
            data.append(json.loads(line))
    return data


def dump_jsonl(data, output_path, append=False):
    """Write a list of objects to a JSON lines file, one JSON object per line."""
    mode = 'a' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))


def cosine(u, v):
    """Compute the cosine similarity between two embedding vectors."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
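
# Example usage (illustrative): orthogonal embeddings have zero similarity.
#   cosine(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  -> 0.0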


def read_csv(input_file, quotechar=None):
    """Read a tab-separated file and return its rows as a list of lists."""
    with open(input_file, "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        return list(reader)


def save_csv(header, data, output_file):
    """Write a header row followed by data rows to a tab-separated file."""
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        # Write the header row, then the data rows
        writer.writerow(header)
        writer.writerows(data)


def save_array(filename, embeddings):
    """Save a NumPy array (e.g. embeddings) to a binary .npy file."""
    with open(filename, 'wb') as f:
        np.save(f, embeddings)


def load_array(filename):
    """Load a NumPy array from a binary .npy file."""
    with open(filename, 'rb') as f:
        a = np.load(f)
    return a


def read_txt(input_file):
    """Read a text file and return its lines, including trailing newlines."""
    with open(input_file, "r", encoding="utf-8") as f:
        return f.readlines()


def save_txt(data, output_file):
    """Write a list of strings to a text file, one string per line."""
    with open(output_file, "w", encoding="utf-8") as writer:
        writer.write("\n".join(data))


def clean_text(text):
    """Remove runs of 2-5 repeated quotes, dashes, tabs, or spaces from the text."""
    for mark in ['"', '-', '\t', ' ']:
        for i in [5, 4, 3, 2]:
            marks = mark * i
            text = text.replace(marks, '')
    return text
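

if __name__ == "__main__":
    # Minimal smoke test (illustrative only): exercises a few helpers on toy
    # data and a temporary directory, so no real project files are touched.
    import os
    import tempfile

    # Majority vote over toy fact-checking labels.
    print(majority_vote(["SUPPORTS", "REFUTES", "SUPPORTS"]))  # SUPPORTS

    # Float parsing checks.
    print(is_float("3.14"), is_float("not-a-number"))  # True False

    # JSONL round trip through a temporary file.
    records = [{"claim": "Water boils at 100 C at sea level.", "label": True}]
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "records.jsonl")
        dump_jsonl(records, path)
        assert load_jsonl(path) == records

    # Cosine similarity between two toy embedding vectors.
    print(cosine(np.array([1.0, 0.0]), np.array([1.0, 1.0])))

    # Strip repeated quote marks and double spaces from a noisy string.
    print(clean_text('claim:  ""quoted""  text'))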