import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push them to the Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face API token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    all_records = []
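
    # Walk the data tree and load every question.json into one flat record.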
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    folder_path = os.path.dirname(file_path)
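
                    # Image filenames are stored relative to the submission
                    # folder; rewrite them as full paths so the Image feature
                    # can locate the files when the dataset is built.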
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]

                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten the nested author_info dict into author_* columns.
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    df = pd.DataFrame(all_records)

    # Sort by custom_id so the pushed dataset has a deterministic row order.
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Columns the final schema expects; any column missing from the collected
    # records is filled with its default below.
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }

    for col, default_value in required_columns.items():
        if col not in df.columns:
            # `df[col] = []` raises a length-mismatch ValueError on a
            # non-empty frame, so replicate list defaults once per row.
            if isinstance(default_value, list):
                df[col] = [list(default_value) for _ in range(len(df))]
            else:
                df[col] = default_value
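
    # Explicit schema: the image-path columns are typed as Image features so
    # the referenced files are read and stored as images when pushed.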
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Restrict the frame to the schema columns, in schema order.
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    dataset = Dataset.from_dict(dataset_dict, features=features)

    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
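

# Example invocation (a minimal sketch: the data directory, the repo id, and
# the HF_TOKEN environment variable are hypothetical placeholders, not values
# defined elsewhere in this script):
if __name__ == "__main__":
    process_and_push_dataset(
        data_dir="data",                        # hypothetical folder of submissions
        hub_repo="your-username/your-dataset",  # hypothetical Hub repo id
        token=os.environ["HF_TOKEN"],           # hypothetical env var holding the token
        private=True,
    )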