import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push them to the Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face API token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    all_records = []
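
    # Walk the data tree and load every question.json into one flat record.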
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    folder_path = os.path.dirname(file_path)
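
                    # Image filenames are stored relative to the submission
                    # folder; rewrite them as full paths so the Image feature
                    # can locate the files when the dataset is built.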
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]

                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten the nested author_info dict into author_* columns.
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    df = pd.DataFrame(all_records)

    # Sort by custom_id so the pushed dataset has a deterministic row order.
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Columns the final schema expects; any column missing from the collected
    # records is filled with its default below.
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }

    for col, default_value in required_columns.items():
        if col not in df.columns:
            # `df[col] = []` raises a length-mismatch ValueError on a
            # non-empty frame, so replicate list defaults once per row.
            if isinstance(default_value, list):
                df[col] = [list(default_value) for _ in range(len(df))]
            else:
                df[col] = default_value
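
    # Explicit schema: the image-path columns are typed as Image features so
    # the referenced files are read and stored as images when pushed.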
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Restrict the frame to the schema columns, in schema order.
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    dataset = Dataset.from_dict(dataset_dict, features=features)

    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
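

# Example invocation (a minimal sketch: the data directory, the repo id, and
# the HF_TOKEN environment variable are hypothetical placeholders, not values
# defined elsewhere in this script):
if __name__ == "__main__":
    process_and_push_dataset(
        data_dir="data",                        # hypothetical folder of submissions
        hub_repo="your-username/your-dataset",  # hypothetical Hub repo id
        token=os.environ["HF_TOKEN"],           # hypothetical env var holding the token
        private=True,
    )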