housearch
/

thu

Model card Files Files and versions Community

thu

File size: 3,714 Bytes

"""
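1. Create a folder named `data` and upload the image files and the caption JSON files into it.
2. Upload the upload_captions.py script.
3. Provide your Hugging Face access token (never commit a real token to a public repository).
4. Run the following two commands in a terminal: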

pip install datasets
python upload_captions.py

"""

import os
import json
from datasets import Dataset, Features, Value, Image
from huggingface_hub import HfApi, login

def load_json_captions(json_path):
	"""
	Load a caption mapping from a JSON file.

	Args:
		json_path: Path to a UTF-8 encoded JSON file. The top level is
			expected to be a JSON object (a dict once parsed).

	Returns:
		The parsed dict. An empty dict is returned (after printing a
		message) when the file cannot be read, is not valid UTF-8/JSON,
		or its top-level value is not a JSON object.
	"""
	try:
		with open(json_path, 'r', encoding='utf-8') as f:
			captions = json.load(f)
	except (OSError, ValueError, RecursionError) as e:
		# ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
		# RecursionError can be raised by pathologically nested JSON.
		print(f"读取JSON文件 {json_path} 时出错: {e}")
		return {}

	# Valid JSON may still be a list/string/number; returning that would
	# break callers that treat the result as a mapping.
	if not isinstance(captions, dict):
		print(f"读取JSON文件 {json_path} 时出错: 顶层结构不是对象(dict)，而是 {type(captions).__name__}")
		return {}

	return captions

def load_images_and_captions(directory_path):
	"""
	Build one record per JPEG image in a directory, attaching its captions.

	Files ending in .jpg/.jpeg (case-insensitive) are treated as images.
	Files named caption_<model>.json are loaded with load_json_captions and
	looked up by image file name.

	Args:
		directory_path: Directory containing both the images and the
			caption JSON files.

	Returns:
		A list of dicts, ordered by file name. Each dict has the keys
		"file_name" (str), "image" ({"bytes": <raw file bytes>}) and one
		"caption_<model>" key per caption file; the caption is "" when
		that file has no entry for the image. Images that cannot be read
		are reported and skipped.
	"""
	# os.listdir returns entries in arbitrary order; sort so the resulting
	# rows are reproducible from run to run.
	files = sorted(os.listdir(directory_path))

	# Collect the JPEG images.
	image_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))]
	print(f"找到 {len(image_files)} 个图像文件")

	# Collect the caption JSON files.
	caption_files = [f for f in files if f.lower().endswith('.json') and f.startswith('caption_')]
	print(f"找到 {len(caption_files)} 个标题文件")

	# Load every caption file, keyed by the model name embedded in its name.
	prefix = 'caption_'
	captions = {}
	for caption_file in caption_files:
		# Remove exactly the leading prefix and the trailing extension.
		# str.replace would also strip these substrings from the middle of
		# the name and would miss upper-case extensions such as ".JSON".
		model_name = os.path.splitext(caption_file)[0][len(prefix):]
		caption_path = os.path.join(directory_path, caption_file)
		loaded = load_json_captions(caption_path)
		if not isinstance(loaded, dict):
			# A non-object JSON payload cannot be looked up by file name.
			print(f"标题文件 {caption_file} 的顶层结构不是对象(dict)，已忽略")
			loaded = {}
		captions[model_name] = loaded

	# Create one record per image.
	data = []
	for image_file in image_files:
		image_path = os.path.join(directory_path, image_file)

		# Read the raw image bytes; skip (but report) unreadable files.
		try:
			with open(image_path, 'rb') as f:
				image_bytes = f.read()
		except OSError as e:
			print(f"读取图像文件 {image_file} 时出错: {e}")
			continue

		data_item = {
			"file_name": image_file,
			"image": {"bytes": image_bytes},
		}
		# Every record gets the same caption columns; missing captions are "".
		for model, caption_data in captions.items():
			data_item[f"caption_{model}"] = caption_data.get(image_file, "")

		data.append(data_item)

	return data

def create_and_push_dataset(data, dataset_name, token):
	"""
	Create a dataset from image/caption records and push it to Hugging Face.

	Args:
		data: List of record dicts. Each is expected to contain "file_name",
			"image" and any number of "caption_*" keys.
		dataset_name: Target repository id, e.g. "user/dataset".
		token: Hugging Face access token passed to login().

	Returns:
		None. When data is empty a message is printed and nothing is
		uploaded.
	"""
	# Check for data first: there is no reason to authenticate when there is
	# nothing to upload.
	if not data:
		print("没有数据可上传")
		return

	# Log in with the token.
	login(token)

	feature_dict = {
		"file_name": Value("string"),
		"image": Image()
	}

	# The caption columns are taken from the first record; all records are
	# assumed to share the same caption_* keys.
	for key in data[0]:
		if key.startswith("caption_"):
			feature_dict[key] = Value("string")

	# Build the Dataset object with an explicit schema.
	features = Features(feature_dict)
	dataset = Dataset.from_list(data, features=features)

	# Push to the Hub.
	dataset.push_to_hub(
		dataset_name,
		private=False  # set to False if you want the dataset to be public
	)

	print(f"数据集成功上传到: https://huggingface.co/datasets/{dataset_name}")

if __name__ == "__main__":
	# Configuration
	IMAGE_CAPTION_DIR = "/workspace/data"
	DATASET_NAME = "housearch/Park-PFI"  # dataset name taken from the earlier error message

	# SECURITY: never hard-code an access token in source. Create one at
	# https://huggingface.co/settings/tokens and export it as HF_TOKEN before
	# running this script. Any token that was previously committed in this
	# file should be revoked.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	if not HF_TOKEN:
		raise SystemExit("未找到环境变量 HF_TOKEN，请先设置后再运行（token 可在 https://huggingface.co/settings/tokens 获取）")

	# Load the images and captions.
	print(f"从 {IMAGE_CAPTION_DIR} 加载图像和标题...")
	data = load_images_and_captions(IMAGE_CAPTION_DIR)
	print(f"已加载 {len(data)} 个图像文件及其标题")

	# Upload to Hugging Face.
	create_and_push_dataset(data, DATASET_NAME, HF_TOKEN)