Upload upload_captions.py
Browse files- upload_captions.py +124 -0
upload_captions.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from datasets import Dataset, Features, Value, Image
|
4 |
+
from huggingface_hub import HfApi, login
|
5 |
+
|
6 |
+
def load_json_captions(json_path):
    """
    Load caption data from a JSON file.

    Args:
        json_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed top-level JSON object as a dict. An empty dict is
        returned, after printing a message, when the file cannot be read,
        is not valid UTF-8/JSON, or its top-level value is not a JSON object.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            captions = json.load(f)
        # Captions are looked up by key, so a top-level list, string or
        # number is treated as a load failure instead of being handed back.
        if not isinstance(captions, dict):
            raise ValueError(
                f"expected a JSON object, got {type(captions).__name__}"
            )
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses, so malformed content is handled here as well.
        print(f"读取JSON文件 {json_path} 时出错: {e}")
        return {}
    return captions
|
17 |
+
|
18 |
+
def load_images_and_captions(directory_path):
    """
    Load the JPEG images in a directory together with their captions.

    Caption files are the JSON files in the same directory whose name starts
    with ``caption_``; the rest of the name (without the extension) is used
    as the model name. Each file is loaded with ``load_json_captions`` and is
    indexed by image file name to find that model's caption.

    Args:
        directory_path: Directory holding the images and caption files.

    Returns:
        A list with one dict per readable image, in sorted file-name order.
        Each dict has ``file_name``, ``image`` (``{"bytes": <file content>}``)
        and one ``caption_<model>`` entry per caption file; the caption is
        ``""`` when that file has no entry for the image. Images that cannot
        be read are reported and skipped.
    """
    caption_prefix = 'caption_'

    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the resulting data the same on every run.
    files = sorted(os.listdir(directory_path))

    # JPEG images (extension matched case-insensitively).
    image_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))]
    print(f"找到 {len(image_files)} 个图像文件")

    # JSON caption files.
    caption_files = [
        f for f in files
        if f.lower().endswith('.json') and f.startswith(caption_prefix)
    ]
    print(f"找到 {len(caption_files)} 个标题文件")

    # Load every caption file, keyed by model name.
    captions = {}
    for caption_file in caption_files:
        # Remove only the leading prefix and the trailing extension. A plain
        # str.replace would also strip these substrings from the middle of
        # the name and would miss an upper-case ".JSON" extension.
        stem = os.path.splitext(caption_file)[0]
        model_name = stem[len(caption_prefix):]
        caption_path = os.path.join(directory_path, caption_file)
        captions[model_name] = load_json_captions(caption_path)

    # Build one data entry per image.
    data = []
    for image_file in image_files:
        image_path = os.path.join(directory_path, image_file)

        # Read the raw image file content; skip images that cannot be read.
        try:
            with open(image_path, 'rb') as f:
                image_bytes = f.read()
        except OSError as e:
            print(f"读取图像文件 {image_file} 时出错: {e}")
            continue

        data_item = {
            "file_name": image_file,
            "image": {"bytes": image_bytes},
        }
        # One caption column per model; empty string when a model has no
        # caption for this image.
        for model, caption_data in captions.items():
            data_item[f"caption_{model}"] = (
                caption_data[image_file] if image_file in caption_data else ""
            )

        data.append(data_item)

    return data
|
74 |
+
|
75 |
+
def create_and_push_dataset(data, dataset_name, token):
    """
    Build a dataset from image/caption rows and push it to Hugging Face.

    Args:
        data: List of row dicts. Each row provides ``file_name`` and
            ``image``; every key of the first row that starts with
            ``caption_`` becomes a string column.
        dataset_name: Name of the dataset to push to; it is also used to
            build the URL printed on success.
        token: Hugging Face access token passed to ``login``.

    Returns:
        None. When ``data`` is empty a message is printed and nothing is
        uploaded.
    """
    # Check for data before authenticating, so an empty run does not log in
    # and a bad token cannot hide the "nothing to upload" message.
    if not data:
        print("没有数据可上传")
        return

    # Log in with the token.
    login(token)

    feature_dict = {
        "file_name": Value("string"),
        "image": Image(),
    }

    # Caption columns are taken from the first row.
    for key in data[0]:
        if key.startswith("caption_"):
            feature_dict[key] = Value("string")

    # Create the Dataset object.
    features = Features(feature_dict)
    dataset = Dataset.from_list(data, features=features)

    # Push to the Hub.
    dataset.push_to_hub(
        dataset_name,
        private=True,  # set to False if the dataset should be public
    )

    print(f"数据集成功上传到: https://huggingface.co/datasets/{dataset_name}")
|
109 |
+
|
110 |
+
if __name__ == "__main__":
    # Configuration.
    IMAGE_CAPTION_DIR = "/Users/chrishsu/Documents/caption"
    DATASET_NAME = "housearch/landscape"  # dataset name seen in your error message

    # Create a token at https://huggingface.co/settings/tokens.
    # The HF_TOKEN environment variable takes precedence when it is set, so
    # the secret does not have to be written into this file; the literal
    # below is only the fallback placeholder.
    HF_TOKEN = os.environ.get("HF_TOKEN", "填入你的token")

    # Load the images and captions.
    print(f"从 {IMAGE_CAPTION_DIR} 加载图像和标题...")
    data = load_images_and_captions(IMAGE_CAPTION_DIR)
    print(f"已加载 {len(data)} 个图像文件及其标题")

    # Upload to Hugging Face.
    create_and_push_dataset(data, DATASET_NAME, HF_TOKEN)
|