housearch committed on
Commit
a1208a8
·
verified ·
1 Parent(s): 51c9bd9

Upload upload_captions.py

Browse files
Files changed (1) hide show
  1. upload_captions.py +124 -0
upload_captions.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from datasets import Dataset, Features, Value, Image
4
+ from huggingface_hub import HfApi, login
5
+
6
def load_json_captions(json_path):
    """Load a caption mapping from a JSON file.

    Args:
        json_path: Path to a UTF-8 encoded JSON file whose top-level value
            is expected to be an object.

    Returns:
        The parsed dict. An empty dict is returned (after printing a
        message) when the file cannot be read, is not valid UTF-8/JSON,
        or its top-level value is not a JSON object.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            captions = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"读取JSON文件 {json_path} 时出错: {e}")
        return {}

    if not isinstance(captions, dict):
        # The result is looked up by key, so a list/str/number payload is
        # treated the same as an unreadable file.
        print(
            f"读取JSON文件 {json_path} 时出错: "
            f"顶层应为JSON对象,实际为 {type(captions).__name__}"
        )
        return {}

    return captions
17
+
18
def load_images_and_captions(directory_path):
    """Load JPEG images and their per-model captions from a directory.

    Caption files are those named ``caption_<model>.json`` (extension
    matched case-insensitively). Each one is read with
    ``load_json_captions`` and looked up by image file name.

    Args:
        directory_path: Directory holding both the images and the caption
            JSON files.

    Returns:
        A list with one dict per image that could be read, in sorted
        file-name order. Each dict contains ``"file_name"``, ``"image"``
        (``{"bytes": <raw file bytes>}``) and one ``"caption_<model>"`` key
        per caption file; the value is an empty string when that model has
        no entry for the image.
    """
    caption_prefix = 'caption_'

    # Sort the listing so row order and caption-column order do not depend
    # on the arbitrary order returned by os.listdir().
    files = sorted(os.listdir(directory_path))

    # Collect all JPEG files.
    image_files = [f for f in files if f.lower().endswith(('.jpg', '.jpeg'))]
    print(f"找到 {len(image_files)} 个图像文件")

    # Collect all JSON caption files.
    caption_files = [
        f for f in files
        if f.lower().endswith('.json') and f.startswith(caption_prefix)
    ]
    print(f"找到 {len(caption_files)} 个标题文件")

    # Load every caption file, keyed by model name.
    captions = {}
    for caption_file in caption_files:
        # Strip only the leading prefix and the trailing extension. A chained
        # str.replace() would also remove occurrences inside the model name
        # and would miss an upper-case ".JSON" extension.
        stem, _ext = os.path.splitext(caption_file)
        model_name = stem[len(caption_prefix):]
        caption_path = os.path.join(directory_path, caption_file)
        captions[model_name] = load_json_captions(caption_path)

    # Build one record per image.
    data = []
    for image_file in image_files:
        image_path = os.path.join(directory_path, image_file)

        # Read the raw image bytes; skip (but report) unreadable files.
        try:
            with open(image_path, 'rb') as f:
                image_bytes = f.read()
        except OSError as e:
            print(f"读取图像文件 {image_file} 时出错: {e}")
            continue

        data_item = {
            "file_name": image_file,
            "image": {"bytes": image_bytes},
        }
        # Add one caption column per model, empty when the model has none.
        for model, caption_data in captions.items():
            data_item[f"caption_{model}"] = (
                caption_data[image_file] if image_file in caption_data else ""
            )

        data.append(data_item)

    return data
74
+
75
def create_and_push_dataset(data, dataset_name, token):
    """Build a dataset from image/caption records and push it to Hugging Face.

    Args:
        data: List of record dicts containing ``"file_name"``, ``"image"``
            and any number of ``"caption_*"`` keys. The caption columns are
            taken from the keys of the first record.
        dataset_name: Repository id passed to ``Dataset.push_to_hub``.
        token: Access token passed to ``login``.

    Returns:
        None. When ``data`` is empty a message is printed and the function
        returns before logging in or uploading anything.
    """
    # Check for data first so an empty run does not log in at all.
    if not data:
        print("没有数据可上传")
        return

    # Log in with the token.
    login(token)

    # Fixed columns, plus one string column per caption key in the first
    # record.
    feature_dict = {
        "file_name": Value("string"),
        "image": Image(),
    }
    for key in data[0]:
        if key.startswith("caption_"):
            feature_dict[key] = Value("string")

    # Create the Dataset object.
    features = Features(feature_dict)
    dataset = Dataset.from_list(data, features=features)

    # Push to the Hub.
    dataset.push_to_hub(
        dataset_name,
        private=True,  # set to False if you want the dataset to be public
    )

    print(f"数据集成功上传到: https://huggingface.co/datasets/{dataset_name}")
109
+
110
if __name__ == "__main__":
    # Configuration.
    IMAGE_CAPTION_DIR = "/Users/chrishsu/Documents/caption"
    DATASET_NAME = "housearch/landscape"  # dataset name as seen in your error message

    # Prefer the HF_TOKEN environment variable so the secret does not have
    # to be written into this file; otherwise fill in a token obtained from
    # https://huggingface.co/settings/tokens
    HF_TOKEN = os.environ.get("HF_TOKEN") or "填入你的token"

    # Load the images and captions.
    print(f"从 {IMAGE_CAPTION_DIR} 加载图像和标题...")
    data = load_images_and_captions(IMAGE_CAPTION_DIR)
    print(f"已加载 {len(data)} 个图像文件及其标题")

    # Upload to Hugging Face.
    create_and_push_dataset(data, DATASET_NAME, HF_TOKEN)