""" 1.創建一個data的資料夾,把圖檔和caption的json檔案上傳到data 2.上傳 upload_captions.py 腳本檔 3.填入你的金鑰 4.在terminal中執行以下兩行指令 pip install datasets python upload_captions.py """ import os import json from datasets import Dataset, Features, Value, Image from huggingface_hub import HfApi, login def load_json_captions(json_path): """ 从JSON文件加载标题数据 """ try: with open(json_path, 'r', encoding='utf-8') as f: captions = json.load(f) return captions except Exception as e: print(f"读取JSON文件 {json_path} 时出错: {e}") return {} def load_images_and_captions(directory_path): """ 加载目录中的图像和对应的JSON格式标题文件。 """ data = [] # 获取目录中的所有文件 files = os.listdir(directory_path) # 找出所有JPEG文件 image_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))] print(f"找到 {len(image_files)} 个图像文件") # 找出所有JSON标题文件 caption_files = [f for f in files if f.lower().endswith('.json') and f.startswith('caption_')] print(f"找到 {len(caption_files)} 个标题文件") # 加载所有JSON标题文件 captions = {} for caption_file in caption_files: model_name = caption_file.replace('caption_', '').replace('.json', '') caption_path = os.path.join(directory_path, caption_file) captions[model_name] = load_json_captions(caption_path) # 为每个图像创建数据条目 for image_file in image_files: image_path = os.path.join(directory_path, image_file) # 获取不同模型的标题 model_captions = {} for model, caption_data in captions.items(): if image_file in caption_data: model_captions[f"caption_{model}"] = caption_data[image_file] else: model_captions[f"caption_{model}"] = "" # 读取图像文件(二进制模式) try: with open(image_path, 'rb') as f: image_bytes = f.read() # 创建数据条目 data_item = { "file_name": image_file, "image": {"bytes": image_bytes}, } # 添加所有模型的标题 data_item.update(model_captions) # 将数据添加到列表 data.append(data_item) except Exception as e: print(f"读取图像文件 {image_file} 时出错: {e}") return data def create_and_push_dataset(data, dataset_name, token): """ 从图像和标题创建数据集并推送到Hugging Face。 """ # 使用token登录 login(token) # 获取第一个数据项以确定特征结构 if not data: print("没有数据可上传") return first_item = data[0] feature_dict = { "file_name": Value("string"), "image": Image() } # 添加所有标题字段 for key in first_item: if key.startswith("caption_"): feature_dict[key] = Value("string") # 创建Dataset对象 features = Features(feature_dict) dataset = Dataset.from_list(data, features=features) # 推送到Hub dataset.push_to_hub( dataset_name, private=False # 如果您希望数据集是公开的,设置为False ) print(f"数据集成功上传到: https://huggingface.co/datasets/{dataset_name}") if __name__ == "__main__": # 配置 IMAGE_CAPTION_DIR = "/workspace/data" DATASET_NAME = "housearch/Park-PFI" # 从您的错误消息中看到的数据集名称 # 从https://huggingface.co/settings/tokens获取token #HF_TOKEN = "填入你的token" HF_TOKEN = "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" # 加载图像和标题 print(f"从 {IMAGE_CAPTION_DIR} 加载图像和标题...") data = load_images_and_captions(IMAGE_CAPTION_DIR) print(f"已加载 {len(data)} 个图像文件及其标题") # 上传到Hugging Face create_and_push_dataset(data, DATASET_NAME, HF_TOKEN)