"""
使用說明:

1. 創建一個 data 的資料夾，把圖檔和 caption 的 json 檔案上傳到 data

2. 上傳 upload_captions.py 腳本檔

3. 填入你的金鑰

4. 在 terminal 中執行以下兩行指令

    pip install datasets
    python upload_captions.py
"""
|
|
|
import os |
|
import json |
|
from datasets import Dataset, Features, Value, Image |
|
from huggingface_hub import HfApi, login |
|
|
|
def load_json_captions(json_path):
    """Load caption data from a JSON file.

    Args:
        json_path: Path to a JSON file mapping image file names to captions.

    Returns:
        The parsed JSON object (expected: dict of file name -> caption text),
        or an empty dict if the file cannot be opened or parsed.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    # Only file-access and JSON-parsing failures are expected here; a broad
    # `except Exception` would hide unrelated bugs.
    except (OSError, json.JSONDecodeError) as e:
        print(f"读取JSON文件 {json_path} 时出错: {e}")
        return {}
|
|
|
def load_images_and_captions(directory_path):
    """Load images and their per-model JSON captions from a directory.

    The directory is expected to contain ``.jpg``/``.jpeg`` images plus one
    or more caption files named ``caption_<model>.json``, each mapping an
    image file name to its caption text.

    Args:
        directory_path: Directory containing the images and caption files.

    Returns:
        A list of dicts, one per readable image, each with keys
        ``file_name`` (str), ``image`` ({"bytes": raw file bytes}), and one
        ``caption_<model>`` string per caption file (empty string when the
        image has no entry in that caption file). Unreadable images are
        skipped with a printed warning.
    """
    data = []

    # Sort for a deterministic dataset row order; os.listdir order is
    # filesystem-dependent and arbitrary.
    files = sorted(os.listdir(directory_path))

    image_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))]
    print(f"找到 {len(image_files)} 个图像文件")

    caption_files = [f for f in files if f.lower().endswith('.json') and f.startswith('caption_')]
    print(f"找到 {len(caption_files)} 个标题文件")

    # The model name is encoded in the file name: caption_<model>.json.
    captions = {}
    for caption_file in caption_files:
        model_name = caption_file.replace('caption_', '').replace('.json', '')
        caption_path = os.path.join(directory_path, caption_file)
        captions[model_name] = load_json_captions(caption_path)

    for image_file in image_files:
        image_path = os.path.join(directory_path, image_file)

        # One caption column per model; images missing from a caption file
        # get an empty string so all records share the same keys.
        model_captions = {}
        for model, caption_data in captions.items():
            model_captions[f"caption_{model}"] = caption_data.get(image_file, "")

        try:
            with open(image_path, 'rb') as f:
                image_bytes = f.read()
        # Only file-access errors are expected when reading raw bytes.
        except OSError as e:
            print(f"读取图像文件 {image_file} 时出错: {e}")
            continue

        data_item = {
            "file_name": image_file,
            "image": {"bytes": image_bytes},
        }
        data_item.update(model_captions)
        data.append(data_item)

    return data
|
|
|
def create_and_push_dataset(data, dataset_name, token, private=False):
    """Build a Hugging Face dataset from the loaded records and push it.

    Args:
        data: List of record dicts as produced by ``load_images_and_captions``
            (keys: ``file_name``, ``image``, and ``caption_<model>`` columns).
        dataset_name: Target repo id on the Hub, e.g. ``"user/dataset"``.
        token: Hugging Face access token used for authentication.
        private: Whether the pushed dataset repo should be private.
            Defaults to False (public), matching the previous behavior.
    """
    # Validate first so an empty run performs no side effects (no login).
    if not data:
        print("没有数据可上传")
        return

    login(token)

    # Schema: fixed columns plus one string column per caption_<model> key
    # found on the first record. NOTE(review): assumes every record shares
    # the same caption keys — load_images_and_captions guarantees this.
    feature_dict = {
        "file_name": Value("string"),
        "image": Image(),
    }
    for key in data[0]:
        if key.startswith("caption_"):
            feature_dict[key] = Value("string")

    features = Features(feature_dict)
    dataset = Dataset.from_list(data, features=features)

    dataset.push_to_hub(
        dataset_name,
        private=private,
    )

    print(f"数据集成功上传到: https://huggingface.co/datasets/{dataset_name}")
|
|
|
if __name__ == "__main__":

    # Directory holding the images and caption_<model>.json files.
    IMAGE_CAPTION_DIR = "/workspace/data"
    # Target dataset repo id on the Hugging Face Hub.
    DATASET_NAME = "housearch/Park-PFI"

    # SECURITY: never hard-code an access token in source code. Read it from
    # the environment instead. Any token previously committed in this file is
    # leaked and must be revoked/rotated at
    # https://huggingface.co/settings/tokens.
    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    if not HF_TOKEN:
        raise SystemExit("请设置环境变量 HF_TOKEN（Hugging Face 访问令牌）后再运行")

    print(f"从 {IMAGE_CAPTION_DIR} 加载图像和标题...")
    data = load_images_and_captions(IMAGE_CAPTION_DIR)
    print(f"已加载 {len(data)} 个图像文件及其标题")

    create_and_push_dataset(data, DATASET_NAME, HF_TOKEN)