# thu / upload_captions.py
# (originally uploaded by housearch — "Update upload_captions.py", commit 2dde7a5)
"""
1.創建一個data的資料夾,把圖檔和caption的json檔案上傳到data
2.上傳 upload_captions.py 腳本檔
3.填入你的金鑰
4.在terminal中執行以下兩行指令
pip install datasets
python upload_captions.py
"""
import os
import json
from datasets import Dataset, Features, Value, Image
from huggingface_hub import HfApi, login
def load_json_captions(json_path):
    """Load caption data from a JSON file.

    Args:
        json_path: Path to a JSON file mapping image file names to captions.

    Returns:
        The parsed caption dict, or an empty dict if the file cannot be
        read or parsed (best-effort: errors are reported, not raised).
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    # Narrow except: only expected failure modes (missing/unreadable file,
    # malformed JSON, bad encoding) — programming errors still propagate.
    except (OSError, json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"读取JSON文件 {json_path} 时出错: {e}")
        return {}
def load_images_and_captions(directory_path):
"""
加载目录中的图像和对应的JSON格式标题文件。
"""
data = []
# 获取目录中的所有文件
files = os.listdir(directory_path)
# 找出所有JPEG文件
image_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))]
print(f"找到 {len(image_files)} 个图像文件")
# 找出所有JSON标题文件
caption_files = [f for f in files if f.lower().endswith('.json') and f.startswith('caption_')]
print(f"找到 {len(caption_files)} 个标题文件")
# 加载所有JSON标题文件
captions = {}
for caption_file in caption_files:
model_name = caption_file.replace('caption_', '').replace('.json', '')
caption_path = os.path.join(directory_path, caption_file)
captions[model_name] = load_json_captions(caption_path)
# 为每个图像创建数据条目
for image_file in image_files:
image_path = os.path.join(directory_path, image_file)
# 获取不同模型的标题
model_captions = {}
for model, caption_data in captions.items():
if image_file in caption_data:
model_captions[f"caption_{model}"] = caption_data[image_file]
else:
model_captions[f"caption_{model}"] = ""
# 读取图像文件(二进制模式)
try:
with open(image_path, 'rb') as f:
image_bytes = f.read()
# 创建数据条目
data_item = {
"file_name": image_file,
"image": {"bytes": image_bytes},
}
# 添加所有模型的标题
data_item.update(model_captions)
# 将数据添加到列表
data.append(data_item)
except Exception as e:
print(f"读取图像文件 {image_file} 时出错: {e}")
return data
def create_and_push_dataset(data, dataset_name, token):
    """Build a Hugging Face ``Dataset`` from image/caption records and push it.

    Args:
        data: List of entries produced by ``load_images_and_captions``.
        dataset_name: Target repo id, e.g. ``"user/dataset"``.
        token: Hugging Face access token used to authenticate.
    """
    # Authenticate against the Hub first.
    login(token)
    # Nothing to do for an empty dataset.
    if not data:
        print("没有数据可上传")
        return
    # Derive the feature schema from the first record: fixed file-name and
    # image columns, plus one string column per caption_<model> key.
    sample = data[0]
    schema = {
        "file_name": Value("string"),
        "image": Image(),
    }
    caption_keys = [k for k in sample if k.startswith("caption_")]
    for key in caption_keys:
        schema[key] = Value("string")
    # Materialize the Dataset and upload it publicly.
    dataset = Dataset.from_list(data, features=Features(schema))
    dataset.push_to_hub(
        dataset_name,
        private=False  # 如果您希望数据集是公开的,设置为False
    )
    print(f"数据集成功上传到: https://huggingface.co/datasets/{dataset_name}")
if __name__ == "__main__":
# 配置
IMAGE_CAPTION_DIR = "/workspace/data"
DATASET_NAME = "housearch/Park-PFI" # 从您的错误消息中看到的数据集名称
# 从https://huggingface.co/settings/tokens获取token
#HF_TOKEN = "填入你的token"
HF_TOKEN = "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"
# 加载图像和标题
print(f"从 {IMAGE_CAPTION_DIR} 加载图像和标题...")
data = load_images_and_captions(IMAGE_CAPTION_DIR)
print(f"已加载 {len(data)} 个图像文件及其标题")
# 上传到Hugging Face
create_and_push_dataset(data, DATASET_NAME, HF_TOKEN)