ZihanWang314 committed
Commit d686824 · verified · 1 parent: 9e6b027

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +10 -0
  2. .gitignore +2 -0
  3. Dockerfile +17 -0
  4. README.md +9 -6
  5. TStar/TStarFramework.py +347 -0
  6. TStar/__init__.py +0 -0
  7. TStar/__pycache__/TStarFramework.cpython-39.pyc +0 -0
  8. TStar/__pycache__/__init__.cpython-39.pyc +0 -0
  9. TStar/__pycache__/interface_llm.cpython-39.pyc +0 -0
  10. TStar/__pycache__/interface_searcher.cpython-39.pyc +0 -0
  11. TStar/__pycache__/interface_yolo.cpython-39.pyc +0 -0
  12. TStar/__pycache__/utils.cpython-39.pyc +0 -0
  13. TStar/interface_llm.py +451 -0
  14. TStar/interface_owl.py +136 -0
  15. TStar/interface_searcher.py +704 -0
  16. TStar/interface_yolo.py +279 -0
  17. TStar/utils.py +105 -0
  18. YOLO-World/.dockerignore +2 -0
  19. YOLO-World/.gitattributes +34 -0
  20. YOLO-World/.gitignore +127 -0
  21. YOLO-World/.gitmodules +3 -0
  22. YOLO-World/Dockerfile +43 -0
  23. YOLO-World/LICENSE +1347 -0
  24. YOLO-World/README.md +238 -0
  25. YOLO-World/assets/finetune_yoloworld.png +3 -0
  26. YOLO-World/assets/reparameterize.png +0 -0
  27. YOLO-World/assets/yolo_arch.png +3 -0
  28. YOLO-World/assets/yolo_logo.png +3 -0
  29. YOLO-World/configs/finetune_coco/README.md +29 -0
  30. YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py +179 -0
  31. YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py +181 -0
  32. YOLO-World/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py +159 -0
  33. YOLO-World/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py +182 -0
  34. YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +181 -0
  35. YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py +160 -0
  36. YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py +161 -0
  37. YOLO-World/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +182 -0
  38. YOLO-World/configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +145 -0
  39. YOLO-World/configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +146 -0
  40. YOLO-World/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +184 -0
  41. YOLO-World/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +183 -0
  42. YOLO-World/configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py +173 -0
  43. YOLO-World/configs/image_prompts/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_image_prompt_demo.py +127 -0
  44. YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py +200 -0
  45. YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +171 -0
  46. YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py +202 -0
  47. YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +171 -0
  48. YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py +171 -0
  49. YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py +198 -0
  50. YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +171 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/sample.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/demo/sample_images/bus.jpg filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/demo/sample_images/zidane.jpg filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/third_party/mmyolo/demo/demo.jpg filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/third_party/mmyolo/demo/dog.jpg filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/third_party/mmyolo/demo/large_image.jpg filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/third_party/mmyolo/resources/qq_group_qrcode.jpg filter=lfs diff=lfs merge=lfs -text
+ YOLO-World/third_party/mmyolo/resources/zhihu_qrcode.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ output
2
+ pretrained
Dockerfile ADDED
@@ -0,0 +1,17 @@
1
+ # basic dockerfile
2
+ FROM python:3.10
3
+
4
+ RUN useradd -m -u 1000 user
5
+ USER user
6
+ ENV PATH="/home/user/.local/bin:$PATH"
7
+
8
+ WORKDIR /app
9
+
10
+ COPY --chown=user ./requirements.txt requirements.txt
11
+
12
+ # install deps
13
+ RUN pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 wheel
14
+ RUN pip install -r requirements.txt
15
+
16
+ COPY --chown=user . /app
17
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,15 @@
  ---
  title: TSTAR
- emoji: 🌖
- colorFrom: green
- colorTo: gray
- sdk: gradio
- sdk_version: 5.20.1
- app_file: app.py
+ emoji: ⚡️🔍🎥🎥🎥🎥🔍⚡️
+ colorFrom: yellow
+ colorTo: indigo
+ sdk: docker
  pinned: false
+ app_port: 7860
  ---

+ <!-- sdk: gradio
+ sdk_version: 5.9.1
+ app_file: app.py -->
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TStar/TStarFramework.py ADDED
@@ -0,0 +1,347 @@
1
+
2
+ """
3
+ TStarSearcher: Comprehensive Video Frame Search Tool
4
+
5
+ This script allows searching for specific objects within a video using YOLO object detection and GPT-4 for question-answering. It leverages the TStar framework's universal Grounder, YOLO interface, and video searcher to identify relevant frames and answer questions based on the detected objects.
6
+
7
+ Usage:
8
+ python tstar_searcher.py --video_path path/to/video.mp4 --question "Your question here" --options "A) Option1\nB) Option2\nC) Option3\nD) Option4"
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import cv2
14
+ import torch
15
+ import copy
16
+ import logging
17
+ import argparse
18
+ import numpy as np
19
+ import matplotlib.pyplot as plt
20
+ from dataclasses import dataclass, field
21
+ from typing import List, Optional, Tuple
22
+ from decord import VideoReader, cpu
23
+ from scipy.interpolate import UnivariateSpline
24
+
25
+ # Import custom TStar interfaces
26
+ from TStar.interface_llm import TStarUniversalGrounder
27
+ from TStar.interface_owl import OWLInterface
28
+ from TStar.interface_searcher import TStarSearcher
29
+ from TStar.utils import save_as_gif
30
+ # Configure logging
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format='%(asctime)s [%(levelname)s] %(message)s',
34
+ handlers=[
35
+ logging.StreamHandler(sys.stdout)
36
+ ]
37
+ )
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ class TStarFramework:
42
+ """
43
+ Main class for performing object-based frame search and question-answering in a video.
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ video_path: str,
49
+ heuristic_scorer: OWLInterface,
50
+ grounder: TStarUniversalGrounder,
51
+ question: str,
52
+ options: str,
53
+ search_nframes: int = 8,
54
+ grid_rows: int = 4,
55
+ grid_cols: int = 4,
56
+ output_dir: str = './output',
57
+ confidence_threshold: float = 0.6,
58
+ search_budget: int = 1000,
59
+ prefix: str = 'stitched_image',
60
+ config_path: Optional[str] = None,
61
+ checkpoint_path: Optional[str] = None,
62
+ device: str = "cuda:0"
63
+ ):
64
+ """
65
+ Initialize VideoSearcher.
66
+
67
+ Args:
68
+ video_path (str): Path to the input video file.
69
+ heuristic_scorer (OWLInterface): Heuristic scorer (object detection) interface instance.
70
+ grounder (TStarUniversalGrounder): Universal Grounder instance.
71
+ question (str): The question for question-answering.
72
+ options (str): Multiple-choice options for the question.
73
+ search_nframes (int, optional): Number of top frames to return. Default is 8.
74
+ grid_rows (int, optional): Number of rows in the image grid. Default is 4.
75
+ grid_cols (int, optional): Number of columns in the image grid. Default is 4.
76
+ output_dir (str, optional): Directory to save outputs. Default is './output'.
77
+ confidence_threshold (float, optional): YOLO detection confidence threshold. Default is 0.6.
78
+ search_budget (int, optional): Maximum number of frames to process during search. Default is 1000.
79
+ prefix (str, optional): Prefix for output filenames. Default is 'stitched_image'.
80
+ config_path (str, optional): Path to the YOLO configuration file. Default is None.
81
+ checkpoint_path (str, optional): Path to the YOLO model checkpoint. Default is None.
82
+ device (str, optional): Device for model inference (e.g., "cuda:0" or "cpu"). Default is "cuda:0".
83
+ """
84
+ self.video_path = video_path
85
+ self.yolo_scorer = heuristic_scorer
86
+ self.grounder = grounder
87
+ self.question = question
88
+ self.options = options
89
+ self.search_nframes = search_nframes
90
+ self.grid_rows = grid_rows
91
+ self.grid_cols = grid_cols
92
+ self.output_dir = output_dir
93
+ self.confidence_threshold = confidence_threshold
94
+ self.search_budget = search_budget
95
+ self.prefix = prefix
96
+ self.config_path = config_path
97
+ self.checkpoint_path = checkpoint_path
98
+ self.device = device
99
+
100
+ # Ensure the output directory exists
101
+ os.makedirs(self.output_dir, exist_ok=True)
102
+ logger.info("VideoSearcher initialized successfully.")
103
+
104
+ self.results = {}
105
+
106
+ def run(self):
107
+ """
108
+ Execute the complete video search and question-answering process.
109
+ """
110
+ # Use Grounder to get target and cue objects
111
+ target_objects, cue_objects = self.get_grounded_objects()
112
+
113
+ # Initialize TStarSearcher
114
+ video_searcher = TStarSearcher(
115
+ video_path=self.video_path,
116
+ target_objects=target_objects,
117
+ cue_objects=cue_objects,
118
+ search_nframes=self.search_nframes,
119
+ image_grid_shape=(self.grid_rows, self.grid_cols),
120
+ output_dir=self.output_dir,
121
+ confidence_threshold=self.confidence_threshold,
122
+ search_budget=self.search_budget,
123
+ prefix=self.prefix,
124
+ owl=self.yolo_scorer
125
+ )
126
+
127
+ logger.info(f"TStarSearcher initialized successfully for video {self.video_path}.")
128
+
129
+ # Perform search
130
+ all_frames, time_stamps = self.perform_search(video_searcher)
131
+
132
+ # Save retrieved frames
133
+ self.save_frames(all_frames, time_stamps)
134
+ self.save_searching_iters(video_searcher)
135
+ # Plot and save score distribution
136
+ self.plot_and_save_scores(video_searcher)
137
+
138
+ # Perform question-answering on retrieved frames
139
+ answer = self.perform_qa(all_frames)
140
+ print("QA Answer:", answer)
141
+
142
+ logger.info("VideoSearcher completed successfully.")
143
+
144
+ def get_grounded_objects(self) -> Tuple[List[str], List[str]]:
145
+ """
146
+ Use Grounder to obtain target and cue objects.
147
+
148
+ Returns:
149
+ Tuple[List[str], List[str]]: Lists of target objects and cue objects.
150
+ """
151
+ # Example code; should be implemented based on Grounder's interface
152
+ # For example:
153
+ target_objects, cue_objects = self.grounder.inference_query_grounding(
154
+ video_path=self.video_path,
155
+ question=self.question
156
+ )
157
+ # Here, assuming fixed target and cue objects
158
+ # target_objects = ["couch"] # Target objects to find
159
+ # cue_objects = ["TV", "chair"] # Cue objects
160
+
161
+ logger.info(f"Target objects: {target_objects}")
162
+ logger.info(f"Cue objects: {cue_objects}")
163
+ self.results["Searching_Objects"] = {"target_objects": target_objects, "cue_objects": cue_objects}
164
+ return target_objects, cue_objects
165
+
166
+ def perform_search(self, video_searcher: TStarSearcher) -> Tuple[List[np.ndarray], List[float]]:
167
+ """
168
+ Execute the frame search process and retrieve relevant frames and timestamps.
169
+
170
+ Args:
171
+ video_searcher (TStarSearcher): Instance of TStarSearcher.
172
+
173
+ Returns:
174
+ Tuple[List[np.ndarray], List[float]]: List of frames and their corresponding timestamps.
175
+ """
176
+ all_frames, time_stamps = video_searcher.search_with_visualization()
177
+ logger.info(f"Found {len(all_frames)} frames, timestamps: {time_stamps}")
178
+
179
+ self.results['timestamps'] = time_stamps
180
+ return all_frames, time_stamps
181
+
182
+ def perform_qa(self, frames: List[np.ndarray]) -> str:
183
+ """
184
+ Perform question-answering on the retrieved frames.
185
+
186
+ Args:
187
+ frames (List[np.ndarray]): List of frames to analyze.
188
+
189
+ Returns:
190
+ str: Answer generated by VLM.
191
+ """
192
+ answer = self.grounder.inference_qa(
193
+ frames=frames,
194
+ question=self.question,
195
+ options=self.options
196
+ )
197
+ self.results['answer'] = answer
198
+ return answer
199
+
200
+ def plot_and_save_scores(self, video_searcher: TStarSearcher):
201
+ """
202
+ Plot the score distribution and save the plot.
203
+
204
+ Args:
205
+ video_searcher (TStarSearcher): Instance of TStarSearcher.
206
+ """
207
+ plot_path = os.path.join(self.output_dir, "score_distribution.png")
208
+ video_searcher.plot_score_distribution(save_path=plot_path)
209
+ logger.info(f"Score distribution plot saved to {plot_path}")
210
+
211
+ def save_frames(self, frames: List[np.ndarray], timestamps: List[float]):
212
+ """
213
+ Save the retrieved frames as image files.
214
+
215
+ Args:
216
+ frames (List[np.ndarray]): List of frames to save.
217
+ timestamps (List[float]): Corresponding timestamps of the frames.
218
+ """
219
+ for idx, (frame, timestamp) in enumerate(zip(frames, timestamps)):
220
+ frame_path = os.path.join(
221
+ self.output_dir,
222
+ f"frame_{idx}_at_{timestamp:.2f}s.jpg"
223
+ )
224
+ cv2.imwrite(frame_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
225
+ logger.info(f"Saved frame to {frame_path}")
226
+
227
+ def save_searching_iters(self, video_searcher, video_ids=[]):
228
+ # # Define the resize operation; target size (640, 640)
229
+ # resize_transform = T.Resize((1024, 1024))
230
+ # resized_frames_tensor = resize_transform(resized_frames_tensor)
231
+
232
+ image_grid_iters = video_searcher.image_grid_iters # iters, b, image # b = 1 for v1
233
+ detect_annotot_iters = video_searcher.detect_annotot_iters # iters, b, image
234
+ detect_bbox_iters = video_searcher.detect_bbox_iters #iters, b, n_objects, xxyy,
235
+
236
+ fps = 1 # frame rate for the saved animation
237
+ for b in range(len(image_grid_iters[0])):
238
+ images = [image_grid_iter[b] for image_grid_iter in image_grid_iters]
239
+ anno_images = [detect_annotot_iter[b] for detect_annotot_iter in detect_annotot_iters]
240
+
241
+ frame_size = (anno_images[0].shape[1], anno_images[0].shape[0]) # image size as (width, height)
242
+
243
+ # Set up the output video parameters
244
+ video_id=self.video_path.split("/")[-1].split(".")[0]
245
+ output_video_path = os.path.join(self.output_dir, f"{video_id}.gif") # output save path
246
+ save_as_gif(images=anno_images, output_gif_path=output_video_path)
247
+
248
+
249
+
250
+
251
+ def initialize_TStar_Scorer(
252
+ heuristic: str,
253
+ device: str
254
+ ) -> OWLInterface:
255
+ """
256
+ Initialize the object detection model used as the T* heuristic scorer.
257
+
258
+ Args:
259
+ heuristic (str): Name of the heuristic detector backend to use.
261
+ device (str): Device for model inference (e.g., "cuda:0").
262
+
263
+ Returns:
264
+ OWLInterface: Initialized OWL-ViT detector interface instance.
265
+
266
+ Raises:
267
+ FileNotFoundError: If the configuration file or checkpoint file is not found.
268
+ """
269
+
270
+ model_choice = 'owl_model'
271
+ if model_choice == 'owl_model':
272
+ model_name="google/owlvit-base-patch32"
273
+ owl_interface = OWLInterface(
274
+ config_path = model_name,
275
+ checkpoint_path=None,
276
+ device="cuda:0"
277
+ )
278
+ logger.info("YoloWorldInterface initialized successfully.")
279
+ return owl_interface
280
+
281
+
282
+ def run_tstar(
283
+ video_path: str,
284
+ question: str,
285
+ options: str,
286
+ grounder: str,
287
+ heuristic: str,
288
+ openai_api_key: str,
289
+ device: str = "cuda:0",
290
+ search_nframes: int = 8,
291
+ grid_rows: int = 4,
292
+ grid_cols: int = 4,
293
+ confidence_threshold: float = 0.6,
294
+ search_budget: float = 0.5,
295
+ output_dir: str = './output',
296
+ ):
297
+ """
298
+ Executes the TStar video frame search and QA process.
299
+
300
+ Args:
301
+ video_path (str): Path to the input video file.
302
+ question (str): Question for video content QA.
303
+ options (str): Multiple-choice options for the question.
304
+ grounder (str): Grounder backend identifier.
+ heuristic (str): Heuristic detector backend identifier.
+ openai_api_key (str): OpenAI API key used by the GPT-4o grounder.
306
+ device (str): Device for model inference (e.g., "cuda:0" or "cpu").
307
+ search_nframes (int): Number of top frames to return.
308
+ grid_rows (int): Number of rows in the image grid.
309
+ grid_cols (int): Number of columns in the image grid.
310
+ confidence_threshold (float): YOLO detection confidence threshold.
311
+ search_budget (float): Maximum ratio of frames to process during search.
312
+ output_dir (str): Directory to save outputs.
313
+ prefix (str): Prefix for output filenames.
314
+
315
+ Returns:
316
+ dict: Results containing detected objects, timestamps, and the QA answer.
317
+ """
318
+ # Initialize Grounder and YOLO
319
+ grounder = TStarUniversalGrounder(backend="gpt4", model_name="gpt-4o", gpt4_api_key=openai_api_key)
320
+
321
+ TStar_Scorer = initialize_TStar_Scorer(
322
+ heuristic=heuristic,
323
+ device=device
324
+ )
325
+
326
+ # Initialize and run the search framework
327
+ searcher = TStarFramework(
328
+ grounder=grounder,
329
+ heuristic_scorer=TStar_Scorer,
330
+ video_path=video_path,
331
+ question=question,
332
+ options=options,
333
+ search_nframes=search_nframes,
334
+ grid_rows=grid_rows,
335
+ grid_cols=grid_cols,
336
+ output_dir=output_dir,
337
+ confidence_threshold=confidence_threshold,
338
+ search_budget=search_budget,
339
+ device=device
340
+ )
341
+ searcher.run()
342
+
343
+ return {
344
+ "Grounding Objects": searcher.results.get('Searching_Objects', []),
345
+ "Frame Timestamps": searcher.results.get('timestamps', []),
346
+ "Answer": searcher.results.get('answer', "No answer generated")
347
+ }
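Not part of the commit itself, but for orientation: a minimal sketch of how the run_tstar entry point above might be invoked. The video path reuses the data/sample.mp4 tracked via LFS in .gitattributes; the question, options, and API key are placeholder values.

# Hypothetical usage sketch for run_tstar (illustration only, not in the diff).
from TStar.TStarFramework import run_tstar

results = run_tstar(
    video_path="data/sample.mp4",                            # sample video tracked in .gitattributes
    question="Where did the person leave the keys?",         # placeholder question
    options="A) couch\nB) table\nC) kitchen\nD) bathroom",   # placeholder options
    grounder="gpt4",                                         # GPT-4o grounder backend
    heuristic="owl",                                         # OWL-ViT heuristic scorer
    openai_api_key="<OPENAI_API_KEY>",                       # placeholder key
    device="cuda:0",
    search_nframes=8,
    grid_rows=4,
    grid_cols=4,
    output_dir="./output",
)
print(results["Answer"], results["Frame Timestamps"])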
TStar/__init__.py ADDED
File without changes
TStar/__pycache__/TStarFramework.cpython-39.pyc ADDED
Binary file (11.2 kB).
TStar/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (127 Bytes).
TStar/__pycache__/interface_llm.cpython-39.pyc ADDED
Binary file (12.7 kB).
TStar/__pycache__/interface_searcher.cpython-39.pyc ADDED
Binary file (19.5 kB).
TStar/__pycache__/interface_yolo.cpython-39.pyc ADDED
Binary file (8.86 kB).
TStar/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.1 kB).
TStar/interface_llm.py ADDED
@@ -0,0 +1,451 @@
1
+
2
+ import torch
3
+ import os
4
+ from tqdm import tqdm
5
+ from typing import Dict, Optional, Sequence, List
6
+ import transformers
7
+ import re
8
+
9
+ import openai
10
+ from typing import List, Dict
11
+ from PIL import Image
12
+ import base64
13
+ import io
14
+ try:
15
+ import cv2
16
+ except ImportError:
17
+ cv2 = None
18
+ print("Warning: OpenCV is not installed, video frame extraction will not work.")
19
+ from TStar.utils import *
20
+
21
+ class LlavaInterface:
22
+ """
23
+ Example: wraps inference calls to a LLaVA model.
24
+ The key point is exposing a single unified method, inference(query, frames, **kwargs).
25
+ """
26
+ def __init__(self, model_path: str, model_base: Optional[str] = None):
27
+ # Loading logic for the LLaVA model would go here
28
+ # self.tokenizer, self.model = ...
29
+ self.model_path = model_path
30
+ self.model_base = model_base
31
+ print(f"[LlavaInterface] model_path={model_path}, model_base={model_base}")
32
+
33
+ def inference(
34
+ self,
35
+ query: str,
36
+ frames: Optional[List[Image.Image]] = None,
37
+ system_message: str = "You are a helpful assistant.",
38
+ temperature: float = 0.2,
39
+ top_p: Optional[float] = None,
40
+ num_beams: int = 1,
41
+ max_tokens: int = 512,
42
+ **kwargs
43
+ ) -> str:
44
+ """
45
+ Unified inference interface exposed to callers.
46
+
47
+ query: user input, possibly containing text plus <image> tags
48
+ frames: the corresponding list of image frames
49
+ system_message: the system prompt
50
+ other parameters can be added as needed
51
+ """
52
+
53
+ # Simulated inference logic; implement this yourself
54
+ print("[LlavaInterface] Inference called with query:", query)
55
+ print("[LlavaInterface] frames count:", len(frames) if frames else 0)
56
+
57
+ # In a real setting this would call the LLaVA model for inference
58
+ return "Fake Response from LlavaInterface"
59
+
60
+ class GPT4Interface:
61
+ def __init__(self,model="gpt-4o", api_key=None):
62
+ """
63
+ Initialize the GPT-4 API client.
64
+
65
+ Reads the OpenAI API key from the environment variable `OPENAI_API_KEY`.
66
+ """
67
+ self.api_key = api_key
68
+ self.model_name = model
69
+ if api_key==None:
70
+ self.api_key = os.getenv("OPENAI_API_KEY")
71
+ if not self.api_key:
72
+ raise ValueError("Environment variable OPENAI_API_KEY is not set.")
73
+ openai.api_key = self.api_key
74
+
75
+ def inference_text_only(self, query: str, system_message: str = "You are a helpful assistant.", temperature: float = 0.7, max_tokens: int = 1000) -> str:
76
+ """
77
+ Perform inference using the GPT-4 API.
78
+
79
+ Args:
80
+ query (str): User's query or input.
81
+ system_message (str): System message to guide the model's behavior.
82
+ temperature (float): Sampling temperature for the response.
83
+ max_tokens (int): Maximum number of tokens for the response.
84
+
85
+ Returns:
86
+ str: The response generated by the GPT-4 model.
87
+ """
88
+ messages = [
89
+ {"role": "system", "content": system_message},
90
+ {"role": "user", "content": query},
91
+ ]
92
+
93
+ try:
94
+ response = openai.chat.completions.create(
95
+ model=self.model_name,
96
+ messages=messages,
97
+ temperature=temperature,
98
+ max_tokens=max_tokens,
99
+ )
100
+
101
+ return response.choices[0].message.content.strip()
102
+ except Exception as e:
103
+ return f"Error: {str(e)}"
104
+
105
+ def inference_with_frames(self, query: str, frames: List[Image.Image], system_message: str = "You are a helpful assistant.", temperature: float = 0.7, max_tokens: int = 1000) -> str:
106
+ """
107
+ Perform inference using the GPT-4 API with video frames as context.
108
+
109
+ Args:
110
+ query (str): User's query or input.
111
+ frames (List[Image.Image]): List of PIL.Image objects to provide visual context.
112
+ system_message (str): System message to guide the model's behavior.
113
+ temperature (float): Sampling temperature for the response.
114
+ max_tokens (int): Maximum number of tokens for the response.
115
+
116
+ Returns:
117
+ str: The response generated by the GPT-4 model.
118
+ """
119
+
120
+ # Messages format
121
+ inputs = [{"type": "text", "text": query}]
122
+
123
+ # Encode frames as Base64 strings
124
+ for i, frame in enumerate(frames):
125
+ try:
126
+ # Convert PIL Image to Base64 string
127
+ frame_base64 = encode_image_to_base64(frame)
128
+ visual_context = {
129
+ "type": "image_url",
130
+ "image_url": {
131
+ "url": f"data:image/jpeg;base64,{frame_base64}",
132
+ "detail": "low"
133
+ }
134
+ }
135
+ # Adding visual context (images) to messages if supported by the model
136
+ inputs.append(visual_context)
137
+
138
+ except Exception as e:
139
+ return f"Error encoding frame {i}: {str(e)}"
140
+
141
+ messages = [
142
+ {"role": "system", "content": system_message},
143
+ {"role": "user", "content": inputs},
144
+ ]
145
+ try:
146
+ response = openai.chat.completions.create(
147
+ model=self.model_name,
148
+ messages=messages,
149
+ temperature=temperature,
150
+ max_tokens=max_tokens
151
+ )
152
+ return response.choices[0].message.content.strip()
153
+ except Exception as e:
154
+ return f"Error: {str(e)}"
155
+
156
+
157
+
158
+ def inference_qa(self, question: str, options: str, frames: List[Image.Image] = None, system_message: str = "You are a helpful assistant.", temperature: float = 0.7, max_tokens: int = 500) -> str:
159
+ """
160
+ Perform inference for a multiple-choice question with optional visual frames as context.
161
+
162
+ Args:
163
+ question (str): The question to answer.
164
+ options (str): Multiple-choice options formatted as a string.
165
+ frames (List[Image.Image], optional): List of PIL.Image objects to provide additional visual context.
166
+ system_message (str): System message to guide the model's behavior.
167
+ temperature (float): Sampling temperature for the response.
168
+ max_tokens (int): Maximum number of tokens for the response.
169
+
170
+ Returns:
171
+ str: The selected option or answer.
172
+ """
173
+ # Construct query
174
+ query = f"Question: {question}\nOptions: {options}\nAnswer with the letter corresponding to the best choice."
175
+
176
+ # Messages format
177
+ inputs = [{"type": "text", "text": query}]
178
+
179
+ if frames:
180
+ # Encode frames as Base64 strings
181
+ for i, frame in enumerate(frames):
182
+ try:
183
+ frame_base64 = encode_image_to_base64(frame)
184
+ visual_context = {
185
+ "type": "image_url",
186
+ "image_url": {
187
+ "url": f"data:image/jpeg;base64,{frame_base64}",
188
+ "detail": "low"
189
+ }
190
+ }
191
+ # Adding visual context (images) to messages if supported by the model
192
+ inputs.append(visual_context)
193
+
194
+ except Exception as e:
195
+ return f"Error encoding frame {i}: {str(e)}"
196
+
197
+ messages = [
198
+ {"role": "system", "content": system_message},
199
+ {"role": "user", "content": inputs},
200
+ ]
201
+
202
+ try:
203
+ response = openai.chat.completions.create(
204
+ model="gpt-4o",
205
+ messages=messages,
206
+ temperature=temperature,
207
+ max_tokens=max_tokens
208
+ )
209
+ return response.choices[0].message.content.strip()
210
+ except Exception as e:
211
+ return f"Error: {str(e)}"
212
+
213
+ def inference_with_frames_all_in_one(self, query: str, frames: List[Image.Image], system_message: str = "You are a helpful assistant.", temperature: float = 0.7, max_tokens: int = 1000) -> str:
214
+ """
215
+ Perform inference using the GPT-4 API with video frames as context.
216
+ Args:
217
+ query (str): User's query or input. image tag: <image>
218
+ frames (List[Image.Image]): List of PIL.Image objects to provide visual context.
219
+ system_message (str): System message to guide the model's behavior.
220
+ temperature (float): Sampling temperature for the response.
221
+ max_tokens (int): Maximum number of tokens for the response.
222
+ Returns:
223
+ str: The response generated by the GPT-4 model.
224
+ """
225
+ # Split query by <image>
226
+ parts = query.split("<image>")
227
+ inputs = []
228
+
229
+ # Add text and images alternately to inputs
230
+ for i, part in enumerate(parts):
231
+ if part.strip():
232
+ inputs.append({"type": "text", "text": part.strip()})
233
+ if i < len(frames): # Ensure we don't exceed the number of available frames
234
+ try:
235
+ frame_base64 = encode_image_to_base64(frames[i])
236
+ visual_context = {
237
+ "type": "image_url",
238
+ "image_url": {
239
+ "url": f"data:image/jpeg;base64,{frame_base64}",
240
+ "detail": "low"
241
+ }
242
+ }
243
+ inputs.append(visual_context)
244
+ except Exception as e:
245
+ return f"Error encoding frame {i}: {str(e)}"
246
+
247
+ messages = [
248
+ {"role": "system", "content": system_message},
249
+ {"role": "user", "content": inputs},
250
+ ]
251
+
252
+ try:
253
+ response = openai.chat.completions.create(
254
+ model=self.model_name,
255
+ messages=messages,
256
+ temperature=temperature,
257
+ max_tokens=max_tokens
258
+ )
259
+ return response.choices[0].message.content.strip()
260
+ except Exception as e:
261
+ return f"Error: {str(e)}"
262
+
263
+ class TStarUniversalGrounder:
264
+ """
265
+ Combines the functionality of the original TStarGrounder and TStarGPTGrounder;
266
+ the backend parameter selects whether LlavaInterface or GPT4Interface is used underneath.
267
+ """
268
+ def __init__(
269
+ self,
270
+ backend: str = "gpt",
271
+ model_name: str = "gpt-4o",
272
+ model_path: Optional[str] = None,
273
+ model_base: Optional[str] = None,
274
+ gpt4_api_key: Optional[str] = None,
275
+ num_frames: Optional[int] = 8,
276
+ ):
277
+ """
278
+ backend: "llava" 或 "gpt4"
279
+ model_path, model_base: Llava 模型的路径及版本
280
+ gpt4_model_name, gpt4_api_key: GPT4 的模型名称及 API Key
281
+ """
282
+ self.backend = backend.lower()
283
+ self.num_frames = num_frames
284
+ if self.backend == "llava":
285
+ # Initialize LlavaInterface
286
+ if not model_path:
287
+ raise ValueError("Please provide model_path for LlavaInterface")
288
+ self.VLM_model_interfance = LlavaInterface(model_path=model_path, model_base=model_base)
289
+ elif self.backend == "gpt4":
290
+ # Initialize GPT4Interface
291
+ self.VLM_model_interfance = GPT4Interface(model=model_name, api_key=gpt4_api_key)
292
+ else:
293
+ raise ValueError("backend must be either 'llava' or 'gpt4'.")
294
+
295
+ def inference_query_grounding(
296
+ self,
297
+ video_path: str,
298
+ question: str,
299
+ options: Optional[str] = None,
300
+ temperature: float = 0.0,
301
+ max_tokens: int = 512
302
+ ) -> Dict[str, List[str]]:
303
+ """
304
+ Identify target_objects that can ground the answer and cue_objects that may help locate them.
305
+ """
306
+
307
+ frames = load_video_frames(video_path=video_path, num_frames=self.num_frames)
308
+ # Build the prompt
309
+ system_prompt = (
310
+ "Here is a video:\n"
311
+ + "\n".join(["<image>"] * len(frames))
312
+ + "\nHere is a question about the video:\n"
313
+ f"Question: {question}\n"
314
+ )
315
+ if options:
316
+ system_prompt += f"Options: {options}\n"
317
+ system_prompt += (
318
+ "\nWhen answering this question about the video:\n"
319
+ "1. What key objects to locate the answer?\n"
320
+ " - List potential key objects (short sentences, separated by commas).\n"
321
+ "2. What cue objects might be near the key objects and might appear in the scenes?\n"
322
+ " - List potential cue objects (short sentences, separated by commas).\n\n"
323
+ "Please provide your answer in two lines, directly listing the key and cue objects, separated by commas."
324
+ )
325
+
326
+ # All backends go through self.VLM_model_interfance.inference # could use a more abstract function
327
+ response = self.VLM_model_interfance.inference_with_frames_all_in_one(
328
+ query=system_prompt,
329
+ frames=frames,
330
+ temperature=temperature,
331
+ max_tokens=max_tokens,
332
+ )
333
+
334
+ # Parse the response according to the expected format
335
+ lines = response.split("\n")
336
+ if len(lines) < 2:
337
+ # print(response)
338
+ raise ValueError(f"Unexpected response format from inference_query_grounding() --> {response}.")
339
+
340
+ target_objects = [self.check_objects_str(obj) for obj in lines[0].split(",") if obj.strip()]
341
+ cue_objects = [self.check_objects_str(obj) for obj in lines[1].split(",") if obj.strip()]
342
+
343
+ return target_objects, cue_objects
344
+ def check_objects_str(self, obj: str):
345
+ obj = obj.lower() # lowercase
346
+ obj = obj.strip().replace("1. ", "")
347
+ obj = obj.strip().replace("2. ", "")
348
+ obj = obj.strip().replace(".", "")
349
+ obj = obj.strip().replace("key objects: ", "")
350
+ obj = obj.strip().replace("cue objects: ", "")
351
+ obj = obj.strip().replace(": ", "")
352
+
353
+ return obj
354
+
355
+
356
+
357
+
358
+ def inference_qa(
359
+ self,
360
+ frames: List[Image.Image],
361
+ question: str,
362
+ options: str,
363
+ temperature: float = 0.2,
364
+ max_tokens: int = 128
365
+ ) -> str:
366
+ """
367
+ Multiple-choice inference; returns the most likely option (e.g., A, B, C, or D).
368
+ """
369
+ system_prompt = (
370
+ "Select the best answer to the following multiple-choice question based on the video.\n"
371
+ + "\n".join(["<image>"] * len(frames))
372
+ + f"\nQuestion: {question}\n"
373
+ + f"Options: {options}\n\n"
374
+ "Answer with the option’s letter from the given choices directly."
375
+ )
376
+
377
+ response = self.VLM_model_interfance.inference_with_frames_all_in_one(
378
+ query=system_prompt,
379
+ frames=frames,
380
+ temperature=temperature,
381
+ max_tokens=30
382
+ )
383
+ return response.strip()
384
+
385
+ def inference_openend_qa(
386
+ self,
387
+ frames: List[Image.Image],
388
+ question: str,
389
+ # options: str,
390
+ temperature: float = 0.2,
391
+ max_tokens: int = 2048
392
+ ) -> str:
393
+ """
394
+ Open-ended QA; returns a short free-form answer based on the video frames.
395
+ """
396
+ system_prompt = (
397
+ "Answer with the question in short based on the video.\n"
398
+ + "\n".join(["<image>"] * len(frames))
399
+ + f"\nQuestion: {question}\n"
400
+ )
401
+
402
+ response = self.VLM_model_interfance.inference_with_frames_all_in_one(
403
+ query=system_prompt,
404
+ frames=frames,
405
+ temperature=temperature,
406
+ max_tokens=30
407
+ )
408
+ return response.strip()
409
+
410
+ if __name__ == "__main__":
411
+ """
412
+ Test example.
413
+ """
414
+ # 1) Use LLaVA as the underlying model
415
+ # print("=== Using Llava backend ===")
416
+ # llava_grounder = TStarUniversalGrounder(
417
+ # backend="llava",
418
+ # model_path="/path/to/llava",
419
+ # model_base="v1.0"
420
+ # )
421
+ frames_fake = [ # random-noise images would be better here
422
+ Image.open("./output_image.jpg"),
423
+ Image.open("/home/yejinhui/Projects/VisualSearch/output_image.jpg")
424
+ ]
425
+ # result_grounding_llava = llava_grounder.inference_query_grounding(
426
+ # frames=frames_fake,
427
+ # question="What objects are in the video?",
428
+ # )
429
+ # print("Llava Grounding Result:", result_grounding_llava)
430
+
431
+ # 2) Use GPT-4 as the underlying model
432
+ print("\n=== Using GPT-4 backend ===")
433
+ gpt4_grounder = TStarUniversalGrounder(
434
+ backend="gpt4",
435
+ model_name="gpt-4o",
436
+ gpt4_api_key=None
437
+ )
438
+ searchable_objects = gpt4_grounder.inference_query_grounding(
439
+ frames=frames_fake,
440
+ question="What objects are in the video?"
441
+ )
442
+ print("GPT-4 Grounding Result:", searchable_objects)
443
+
444
+ # 3) Multiple-choice QA example
445
+ question_mc = "How many cats can be seen?\n"
446
+ options_mc = "A) 0\nB) 1\nC) 2\nD) 3\n"
447
+ # answer_llava = llava_grounder.inference_qa(frames_fake, question_mc, options_mc)
448
+ # print("Llava QA Answer:", answer_llava)
449
+
450
+ answer_gpt4 = gpt4_grounder.inference_qa(frames_fake, question_mc, options_mc)
451
+ print("GPT-4 QA Answer:", answer_gpt4)
TStar/interface_owl.py ADDED
@@ -0,0 +1,136 @@
1
+ from typing import List
2
+ from transformers import OwlViTProcessor, OwlViTForObjectDetection
3
+ import cv2
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torch
7
+ import supervision as sv
8
+ from torch.cuda.amp import autocast
9
+ class owlInterface:
10
+ def __init__(self):
11
+ """
12
+ Placeholder base class for the OWL-ViT detector interface.
13
+
14
+ Args:
15
+ """
16
+
17
+
18
+ pass
19
+
20
+ class OWLInterface(owlInterface):
21
+ # def __init__(self, model_name="google/owlvit-base-patch32"):
22
+ def __init__(self, config_path: str, checkpoint_path: None, device: str = "cuda:0"):
23
+ self.processor, self.model = self.load_model_and_tokenizer(config_path)
24
+ self.device = device
25
+ self.model = self.model.to(self.device)
26
+ self.texts = ["couch", "table", "woman"]
27
+
28
+ def load_model_and_tokenizer(self, model_name):
29
+ processor = OwlViTProcessor.from_pretrained(model_name)
30
+ model = OwlViTForObjectDetection.from_pretrained(model_name)
31
+ return processor, model
32
+
33
+ def forward_model(self, inputs):
34
+ with torch.no_grad():
35
+ outputs = self.model(**inputs)
36
+ return outputs
37
+
38
+ def inference(self, image_path, use_amp: bool = False):
39
+ with Image.open(image_path).convert("RGB") as image:
40
+ width, height = image.size
41
+ inputs = self.processor(text=self.texts, images=image, return_tensors="pt").to(self.device)
42
+
43
+ # Run model inference
44
+ outputs = self.forward_model(inputs)
45
+
46
+ # Post-process outputs
47
+ target_size = torch.tensor([[height, width]])
48
+ results = self.processor.post_process_grounded_object_detection(
49
+ outputs=outputs, target_sizes=target_size)[0]
50
+ detections = sv.Detections.from_transformers(transformers_results=results)
51
+ return detections
52
+
53
+ def inference_detector(self, images, use_amp: bool = False):
54
+ # batch_images = []
55
+ # for i in range(4): #@Jinhui why there are hard code and why four? bug!!
56
+ # for j in range(4):
57
+ # # Extract the smaller image from the grid
58
+ # small_image = image[i*120:(i+1)*120, j*160:(j+1)*160]
59
+ # batch_images.append(small_image)
60
+ # # batch_images = np.array(batch_images)
61
+ # images = [Image.fromarray(np.uint8(img)).convert("RGB") for img in batch_images]
62
+ batch_images = np.array(images)
63
+ inputs = self.processor(text= self.texts, images=batch_images[0], return_tensors="pt").to(self.device)
64
+ height, width = batch_images[0].shape[:2]
65
+ detections_inbatch = []
66
+ with torch.no_grad():
67
+ # Run model inference
68
+ outputs = self.forward_model(inputs)
69
+
70
+ target_sizes = torch.tensor([[height, width] for i in batch_images])
71
+ results = self.processor.post_process_grounded_object_detection(
72
+ outputs=outputs, target_sizes=target_sizes, threshold=0.05)
73
+ for result in results:
74
+ detections = sv.Detections.from_transformers(transformers_results=result)
75
+ detections_inbatch.append(detections)
76
+
77
+
78
+ check = True
79
+ if check:
80
+ # save first image for checking
81
+ bounding_box_annotator = sv.BoxAnnotator()
82
+ annotated_image = bounding_box_annotator.annotate(batch_images[0] , detections_inbatch[0])
83
+
84
+ output_image = Image.fromarray(annotated_image[:, :, ::-1])
85
+ output_image.save("./annotated_image.png")
86
+ self.detections_inbatch = detections_inbatch
87
+ return detections_inbatch
88
+
89
+ def bbox_visualization(self, images, detections_inbatch):
90
+ # image = Image.open(image_path).convert("RGB")
91
+ # output_image.save(output_path)
92
+ # detections = self.inference(images)
93
+ # Annotate image
94
+ # detections = self.inference(images)
95
+ bounding_box_annotator = sv.BoxAnnotator()
96
+ annotated_images = []
97
+ for image, detections in zip(images,detections_inbatch):
98
+ annotated_image = bounding_box_annotator.annotate(image, detections)
99
+ # output_image = Image.fromarray(annotated_image[:, :, ::-1])
100
+ annotated_images.append(annotated_image)
101
+
102
+ return annotated_images
103
+ def reparameterize_object_list(self, target_objects: List[str], cue_objects: List[str]):
104
+ """
105
+ Reparameterize the detect object list to be used by the OWL model.
106
+
107
+ Args:
108
+ target_objects (List[str]): List of target object names.
109
+ cue_objects (List[str]): List of cue object names.
110
+ """
111
+ # Combine target objects and cue objects into the final text format
112
+ combined_texts = target_objects + cue_objects
113
+
114
+ # Format the text prompts for the YOLO model
115
+ self.texts = [[obj.strip()] for obj in combined_texts] + [[' ']]
116
+
117
+ # Reparameterize the YOLO model with the provided text prompts
118
+ # self.model.reparameterize(self.texts)
119
+
120
+ def main():
121
+ model_choice = 'owl_model'
122
+
123
+ image_path = "/home/anabella/projects/MLLM/TSTAR/data/score/annotated_image.png"
124
+ output_path = "/home/anabella/projects/MLLM/TSTAR/data/score/annotated_image3.png"
125
+
126
+ if model_choice == 'owl_model':
127
+ model_name="google/owlvit-base-patch32"
128
+ owl_interface = OWLInterface(
129
+ config_path = model_name,
130
+ checkpoint_path=None,
131
+ device="cuda:0"
132
+ )
133
+ owl_interface.bbox_visualization(image_path, output_path)
134
+
135
+ if __name__ == "__main__":
136
+ main()
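Again not part of the diff: a minimal sketch of driving OWLInterface directly on a single image, using the same google/owlvit-base-patch32 checkpoint hard-coded above. The image path is a placeholder.

# Hypothetical usage sketch for OWLInterface (illustration only).
from TStar.interface_owl import OWLInterface

owl = OWLInterface(
    config_path="google/owlvit-base-patch32",  # HF model id, as used in main() above
    checkpoint_path=None,
    device="cuda:0",
)
# inference() scores the default vocabulary (owl.texts) against one image
# and returns a supervision.Detections object.
detections = owl.inference("path/to/frame.jpg")  # placeholder image path
print(detections.xyxy, detections.class_id, detections.confidence)

In the framework itself, reparameterize_object_list() is meant to swap in the grounder's target and cue objects before inference_detector() is called on stitched image grids by TStarSearcher.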
TStar/interface_searcher.py ADDED
@@ -0,0 +1,704 @@
1
+ import cv2
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from dataclasses import dataclass, field
5
+ from typing import List, Optional, Tuple
6
+ from decord import VideoReader, cpu
7
+ from scipy.interpolate import UnivariateSpline
8
+ import copy
9
+ from tqdm import tqdm
10
+ import os
11
+ import sys
12
+ import cv2
13
+ import copy
14
+ import logging
15
+
16
+ # Assuming YoloWorldInterface is defined elsewhere and imported correctly
17
+ # from your_project.yolo_interface import YoloWorldInterface
18
+ # Import the custom TStar interfaces
19
+ # from TStar.interface_yolo import YoloWorldInterface, YoloV5Interface, YoloInterface
20
+ from .interface_owl import OWLInterface, owlInterface
21
+ class TStarSearcher:
22
+ """
23
+ A class to perform keyframe search in a video using object detection and dynamic sampling.
24
+
25
+ Attributes:
26
+ video_path (str): Path to the video file.
27
+ target_objects (List[str]): List of target objects to find.
28
+ cue_objects (List[str]): List of cue objects for context.
29
+ confidence_threshold (float): Minimum confidence threshold for object detection.
30
+ search_nframes (int): Number of keyframes to search for.
31
+ image_grid_shape (Tuple[int, int]): Shape of the image grid for detection.
32
+ output_dir (Optional[str]): Directory to save outputs.
33
+ profix (str): Prefix for output files.
34
+ object2weight (dict): Weights assigned to specific objects.
35
+ raw_fps (float): Original frames per second of the video.
36
+ total_frame_num (int): Total number of frames adjusted for sampling rate.
37
+ duration (float): Duration of the video in seconds.
38
+ remaining_targets (List[str]): Targets yet to be found.
39
+ search_budget (int): Budget for the number of frames to process.
40
+ score_distribution (np.ndarray): Scores assigned to each frame.
41
+ P_history (List[List[float]]): History of probability distributions.
42
+ non_visiting_frames (np.ndarray): Indicator for frames not yet visited.
43
+ yolo (YoloWorldInterface): YOLO interface for object detection.
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ video_path: str,
49
+ target_objects: List[str],
50
+ cue_objects: List[str],
51
+ search_nframes: int = 8,
52
+ image_grid_shape: Tuple[int, int] = (8, 8),
53
+ search_budget: float = 0.1,
54
+ output_dir: Optional[str] = None,
55
+ prefix: str = None,
56
+ confidence_threshold: float = 0.5,
57
+ object2weight: Optional[dict] = None,
58
+ model_choice: str ="owl",
59
+ owl = None,
60
+ ):
61
+ """
62
+ Initializes the TStarSearcher object with video properties and configurations.
63
+
64
+ Args:
65
+ video_path (str): Path to the input video file.
66
+ target_objects (List[str]): List of objects to detect as primary targets.
67
+ cue_objects (List[str]): List of contextual objects to aid detection.
68
+ cue_object (Optional[str]): A single cue object for additional focus.
69
+ search_nframes (int): Number of keyframes to identify.
70
+ image_grid_shape (Tuple[int, int]): Grid dimensions for image tiling.
71
+ output_dir (Optional[str]): Directory to store results.
72
+ prefix (str): Prefix for saved output files.
73
+ confidence_threshold (float): Threshold for object detection confidence.
74
+ object2weight (Optional[dict]): Mapping of objects to their respective detection weights.
75
+ config_path (str): Path to the YOLO configuration file.
76
+ checkpoint_path (str): Path to the YOLO model checkpoint.
77
+ device (str): Device for model inference (e.g., "cuda:0").
78
+ """
79
+ self.video_path = video_path
80
+ self.target_objects = target_objects
81
+ self.cue_objects = cue_objects
82
+ self.search_nframes = search_nframes
83
+ self.image_grid_shape = image_grid_shape
84
+ self.output_dir = output_dir
85
+ self.profix = prefix
86
+ self.confidence_threshold = confidence_threshold
87
+ self.object2weight = object2weight if object2weight else {}
88
+ self.fps = 1 # Sampling at 1 fps # TODO look at this
89
+ self.model_choice = model_choice
90
+ # Video properties
91
+ cap = cv2.VideoCapture(self.video_path)
92
+ if not cap.isOpened():
93
+ raise ValueError(f"Cannot open video file: {self.video_path}")
94
+ self.raw_fps = cap.get(cv2.CAP_PROP_FPS)
95
+ self.total_frame_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
96
+ self.duration = self.total_frame_num / self.raw_fps
97
+
98
+ # Adjust total frame number based on sampling rate
99
+ self.total_frame_num = int(self.duration * self.fps)
100
+ self.remaining_targets = target_objects.copy()
101
+ self.search_budget = min(1000, self.total_frame_num*search_budget)
102
+
103
+ # Initialize distributions
104
+ self.score_distribution = np.zeros(self.total_frame_num)
105
+ self.P_history = []
106
+ self.non_visiting_frames = np.ones(self.total_frame_num)
107
+ self.P = np.ones(self.total_frame_num) * self.confidence_threshold * 0.3
108
+
109
+ # Initialize YOLO interface TODO : allow for YOLOV5
110
+ self.owl = None
111
+ # self.reset_yolo_vocabulary(target_objects=target_objects, cue_objects=cue_objects)
112
+ for object in target_objects:
113
+ self.object2weight[object] = 1.0
114
+ for object in cue_objects:
115
+ self.object2weight[object] = 0.5
116
+ #TODO: put in if statement
117
+ # Initialize OWL interface
118
+ model_name="google/owlvit-base-patch32"
119
+ self.owl = owl
120
+ # self.owl = OWLInterface(
121
+ # config_path = model_name,
122
+ # checkpoint_path=None,
123
+ # device="cuda:0"
124
+ # )
125
+
126
+ # self.reset_owl_vocabulary(target_objects=target_objects, cue_objects=cue_objects)
127
+ for object in target_objects:
128
+ self.object2weight[object] = 1.0
129
+ for object in cue_objects:
130
+ self.object2weight[object] = 0.5
131
+
132
+
133
+ def reset_yolo_vocabulary(self, target_objects: List[str], cue_objects: List[str]):
134
+ """
135
+ Dynamically resets the YOLO vocabulary with the specified target and cue objects.
136
+
137
+ Args:
138
+ target_objects (List[str]): New list of target objects for detection.
139
+ cue_objects (List[str]): New list of cue objects for detection context.
140
+ """
141
+ self.target_objects = target_objects
142
+ self.cue_objects = cue_objects
143
+ self.owl.reparameterize_object_list(target_objects, cue_objects)
144
+
145
+ ### --- Detection Methods --- ###
146
+
147
+ def imageGridScoreFunction(
148
+ self,
149
+ images: List[np.ndarray],
150
+ output_dir: Optional[str],
151
+ image_grids: Tuple[int, int]
152
+ ) -> Tuple[np.ndarray, List[List[List[str]]]]:
153
+ """
154
+ Perform object detection on a batch of images using the YOLO interface.
155
+
156
+ Args:
157
+ images (List[np.ndarray]): List of images to process.
158
+ output_dir (Optional[str]): Directory to save detection results.
159
+ image_grids (Tuple[int, int]): Dimensions of the image grid (rows, cols).
160
+
161
+ Returns:
162
+ Tuple[np.ndarray, List[List[List[str]]]]: Confidence maps and detected object lists.
163
+ - confidence_maps: numpy array of shape (num_images, grid_rows, grid_cols)
164
+ - detected_objects_maps: list of lists, each sublist corresponds to a grid_image and contains detected objects per cell
165
+ """
166
+ if len(images) == 0:
167
+ return np.array([]), []
168
+
169
+ grid_rows, grid_cols = image_grids
170
+ grid_height = images[0].shape[0] / grid_rows
171
+ grid_width = images[0].shape[1] / grid_cols
172
+
173
+ confidence_maps = []
174
+ detected_objects_maps = []
175
+
176
+ # Perform detection on all images --AI
177
+ for image in images:
178
+ if self.model_choice == "yolo":
179
+ # Run the YOLO inference
180
+ detections = self.owl.inference_detector(
181
+ images=[image], # Single image as a batch
182
+ max_dets=50,
183
+ use_amp=False
184
+ )
185
+ elif self.model_choice == "owl":
186
+ # Run the OWL inference
187
+ detections = self.owl.inference_detector(
188
+ images=[image], # Single image as a batch
189
+ use_amp=False
190
+ )
191
+
192
+ # Initialize confidence map and detected objects map
193
+ confidence_map = np.zeros((grid_rows, grid_cols))
194
+ detected_objects_map = [[] for _ in range(grid_rows * grid_cols)]
195
+
196
+ # Process detections
197
+ for detection in detections:
198
+ for bbox, label, confidence in zip(detection.xyxy, detection.class_id, detection.confidence):
199
+ # Convert class ID to object name
200
+ if self.model_choice == "yolo":
201
+ object_name = self.owl.texts[label][0] #@Jinhui TBD for YOLOWorld
202
+ elif self.model_choice == "owl":
203
+ object_name = self.owl.texts[label][0]
204
+
205
+ # Apply object weight if available
206
+ weight = self.object2weight.get(object_name, 0.5)
207
+ adjusted_confidence = confidence * weight
208
+
209
+ # Calculate bounding box center
210
+ x_min, y_min, x_max, y_max = bbox
211
+ box_center_x = (x_min + x_max) / 2
212
+ box_center_y = (y_min + y_max) / 2
213
+
214
+ # Map center to grid cell
215
+ grid_x = int(box_center_x // grid_width)
216
+ grid_y = int(box_center_y // grid_height)
217
+
218
+ # Ensure grid indices are valid
219
+ grid_x = min(grid_x, grid_cols - 1)
220
+ grid_y = min(grid_y, grid_rows - 1)
221
+
222
+ # Update confidence map and detected objects
223
+ cell_index = grid_y * grid_cols + grid_x
224
+ confidence_map[grid_y, grid_x] = max(confidence_map[grid_y, grid_x], adjusted_confidence)
225
+ detected_objects_map[cell_index].append(object_name)
226
+
227
+ confidence_maps.append(confidence_map)
228
+ detected_objects_maps.append(detected_objects_map)
229
+
230
+ return np.stack(confidence_maps), detected_objects_maps
231
+
232
+ def read_frame_batch(self, video_path: str, frame_indices: List[int]) -> Tuple[List[int], np.ndarray]:
233
+ """
234
+ Reads a batch of frames from the video at specified indices.
235
+
236
+ Args:
237
+ video_path (str): Path to the video file.
238
+ frame_indices (List[int]): Indices of frames to read.
239
+
240
+ Returns:
241
+ Tuple[List[int], np.ndarray]: List of indices and corresponding frame array.
242
+ """
243
+ vr = VideoReader(video_path, ctx=cpu(0))
244
+ return frame_indices, vr.get_batch(frame_indices).asnumpy()
245
+
246
+ def create_image_grid(self, frames: List[np.ndarray], rows: int, cols: int) -> np.ndarray:
247
+ """
248
+ Combine frames into a single image grid.
249
+
250
+ Args:
251
+ frames (List[np.ndarray]): List of frame images.
252
+ rows (int): Number of rows in the grid.
253
+ cols (int): Number of columns in the grid.
254
+
255
+ Returns:
256
+ np.ndarray: Combined image grid.
257
+ """
258
+ if len(frames) != rows * cols:
259
+ raise ValueError("Frame count does not match grid dimensions")
260
+
261
+ # Resize frames to fit the grid
262
+ resized_frames = [cv2.resize(frame, (160, 120)) for frame in frames] # Resize to 160x120
263
+ grid_rows = [np.hstack(resized_frames[i * cols:(i + 1) * cols]) for i in range(rows)]
264
+ return np.vstack(grid_rows)
265
+
266
+ ### --- Scoring Methods --- ###
267
+
268
+ def score_image_grids(
269
+ self,
270
+ images: List[np.ndarray],
271
+ image_grids: Tuple[int, int]
272
+ ) -> Tuple[np.ndarray, List[List[List[str]]]]:
273
+ """
274
+ Generate confidence maps and detected objects for each image grid.
275
+
276
+ Args:
277
+ images (List[np.ndarray]): List of image grids to detect objects.
278
+ image_grids (Tuple[int, int]): Grid dimensions (rows, cols).
279
+
280
+ Returns:
281
+ Tuple[np.ndarray, List[List[List[str]]]]: Confidence maps and detected objects maps.
282
+ """
283
+ return self.imageGridScoreFunction(
284
+ images=images,
285
+ output_dir=self.output_dir,
286
+ image_grids=image_grids
287
+ )
288
+
289
+
290
+ def store_score_distribution(self):
291
+ """
292
+ Stores a copy of the current probability distribution to the history.
293
+ """
294
+ self.P_history.append(copy.deepcopy(self.P).tolist())
295
+
296
+ def update_top_25_with_window(
297
+ self,
298
+ frame_confidences: List[float],
299
+ sampled_frame_indices: List[int],
300
+ window_size: int = 5
301
+ ):
302
+ """
303
+ Update score distribution for top 25% frames and their neighbors.
304
+
305
+ Args:
306
+ frame_confidences (List[float]): Confidence scores for sampled frames.
307
+ sampled_frame_indices (List[int]): Corresponding frame indices.
308
+ window_size (int): Number of neighboring frames to update.
309
+ """
310
+ # Calculate the threshold for top 25%
311
+ top_25_threshold = np.percentile(frame_confidences, 75)
312
+
313
+ # Identify top 25% frames
314
+ top_25_indices = [
315
+ frame_idx for frame_idx, confidence in zip(sampled_frame_indices, frame_confidences)
316
+ if confidence >= top_25_threshold
317
+ ]
318
+
319
+ # Update neighboring frames
320
+ for frame_idx in top_25_indices:
321
+ for offset in range(-window_size, window_size + 1):
322
+ neighbor_idx = frame_idx + offset
323
+ if 0 <= neighbor_idx < len(self.score_distribution):
324
+ self.score_distribution[neighbor_idx] = max(
325
+ self.score_distribution[neighbor_idx],
326
+ self.score_distribution[frame_idx]/(abs(offset) + 1)
327
+ )
328
+
329
+ def spline_keyframe_distribution(
330
+ self,
331
+ non_visiting_frames: np.ndarray,
332
+ score_distribution: np.ndarray,
333
+ video_length: int
334
+ ) -> np.ndarray:
335
+ """
336
+ Generate a probability distribution over frames using spline interpolation.
337
+
338
+ Args:
339
+ non_visiting_frames (np.ndarray): Indicator array for frames not yet visited.
340
+ score_distribution (np.ndarray): Current score distribution over frames.
341
+ video_length (int): Total number of frames.
342
+
343
+ Returns:
344
+ np.ndarray: Normalized probability distribution over frames.
345
+ """
346
+ # Extract indices and scores of visited frames
347
+ frame_indices = np.array([idx for idx, visited in enumerate(non_visiting_frames) if visited == 0])
348
+ observed_scores = np.array([score_distribution[idx] for idx in frame_indices])
349
+
350
+ # If no frames have been visited, return uniform distribution
351
+ if len(frame_indices) == 0:
352
+ return np.ones(video_length) / video_length
353
+
354
+ # Spline interpolation
355
+ spline = UnivariateSpline(frame_indices, observed_scores, s=0.5)
356
+ all_frames = np.arange(video_length)
357
+ spline_scores = spline(all_frames)
358
+
359
+ # Apply sigmoid function
360
+ def sigmoid(x):
361
+ return 1 / (1 + np.exp(-x))
362
+
363
+ adjusted_scores = np.maximum(1 / video_length, spline_scores)
364
+ p_distribution = sigmoid(adjusted_scores)
365
+
366
+ # Normalize the distribution
367
+ p_distribution /= p_distribution.sum()
368
+
369
+ return p_distribution
370
+
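A standalone toy sketch of the resampling distribution built above: scores of a few visited frames are spline-interpolated over all frames, floored at 1/N, squashed with a sigmoid, and normalized. The frame indices and scores here are invented for illustration only.

import numpy as np
from scipy.interpolate import UnivariateSpline

video_length = 100
visited = np.array([0, 20, 50, 99])        # frames already scored (toy values)
scores = np.array([0.1, 0.8, 0.3, 0.05])   # their detection confidences (toy values)
spline = UnivariateSpline(visited, scores, s=0.5)
spline_scores = spline(np.arange(video_length))
adjusted = np.maximum(1.0 / video_length, spline_scores)   # exploration floor, as above
p = 1.0 / (1.0 + np.exp(-adjusted))                        # sigmoid, as above
p /= p.sum()
assert abs(p.sum() - 1.0) < 1e-9
print(int(np.argmax(p)))  # the largest mass lands near the best-scoring visited frame (around frame 20)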
371
+ def update_frame_distribution(
372
+ self,
373
+ sampled_frame_indices: List[int],
374
+ confidence_maps: np.ndarray,
375
+ detected_objects_maps: List[List[List[str]]]
376
+ ) -> Tuple[List[float], List[List[str]]]:
377
+ """
378
+ Update the frame distribution based on detection results.
379
+
380
+ Args:
381
+ sampled_frame_indices (List[int]): Indices of sampled frames.
382
+ confidence_maps (np.ndarray): Confidence maps from detection.
383
+ detected_objects_maps (List[List[List[str]]]): Detected objects from detection.
384
+
385
+ Returns:
386
+ Tuple[List[float], List[List[str]]]: Frame confidences and detected objects.
387
+ """
388
+ confidence_map = confidence_maps[0] # Only one image grid @TBD
389
+ detected_objects_map = detected_objects_maps[0]
390
+
391
+ grid_rows, grid_cols = self.image_grid_shape
392
+
393
+ frame_confidences = []
394
+ frame_detected_objects = []
395
+ for idx, frame_idx in enumerate(sampled_frame_indices):
396
+ # Calculate grid cell position
397
+ row = idx // grid_cols
398
+ col = idx % grid_cols
399
+ confidence = confidence_map[row, col]
400
+ detected_objects = detected_objects_map[idx]
401
+ frame_confidences.append(confidence)
402
+ frame_detected_objects.append(detected_objects)
403
+
404
+ # Update non-visiting frames and score distribution
405
+ for frame_idx, confidence in zip(sampled_frame_indices, frame_confidences):
406
+ self.non_visiting_frames[frame_idx] = 0 # Mark as visited
407
+ self.score_distribution[frame_idx] = confidence
408
+
409
+ # Update top 25% frames
410
+ self.update_top_25_with_window(frame_confidences, sampled_frame_indices)
411
+
412
+ # Update probability distribution
413
+ self.P = self.spline_keyframe_distribution(
414
+ self.non_visiting_frames,
415
+ self.score_distribution,
416
+ len(self.score_distribution)
417
+ )
418
+
419
+ # Store the updated distribution
420
+ self.store_score_distribution()
421
+
422
+ return frame_confidences, frame_detected_objects
423
+
424
+ ### --- Sampling Methods --- ###
425
+
426
+ def sample_frames(self, num_samples: int) -> Tuple[List[int], np.ndarray]:
427
+ """
428
+ Sample frames based on the current score distribution.
429
+
430
+ Args:
431
+ num_samples (int): Number of frames to sample.
432
+
433
+ Returns:
434
+ Tuple[List[int], np.ndarray]: Sampled frame indices and frame data.
435
+ """
436
+ if num_samples > self.total_frame_num:
437
+ num_samples = self.total_frame_num
438
+
439
+ # Adjust probabilities for non-visited frames
440
+ _P = (self.P + num_samples / self.total_frame_num) * self.non_visiting_frames
441
+ _P /= _P.sum()
442
+
443
+ # Sample frames
444
+ sampled_frame_secs = np.random.choice(
445
+ self.total_frame_num,
446
+ size=num_samples,
447
+ replace=False,
448
+ p=_P
449
+ )
450
+ sampled_frame_indices = [int(sec * self.raw_fps / self.fps) for sec in sampled_frame_secs]
451
+
452
+ # Read frames
453
+ frame_indices, frames = self.read_frame_batch(
454
+ video_path=self.video_path,
455
+ frame_indices=sampled_frame_indices
456
+ )
457
+
458
+ return sampled_frame_secs.tolist(), frames
459
+
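A standalone sketch of the sampling step used by sample_frames above: visited frames get probability zero, unvisited frames keep a small exploration boost, and frames are drawn without replacement from the renormalized distribution. The sizes and visited indices below are made up.

import numpy as np

total, num_samples = 50, 8
P = np.ones(total) / total               # current keyframe distribution (toy: uniform)
non_visiting = np.ones(total)
non_visiting[[3, 7, 11]] = 0             # already-visited frames
_P = (P + num_samples / total) * non_visiting
_P /= _P.sum()
picked = np.random.choice(total, size=num_samples, replace=False, p=_P)
assert not set(picked) & {3, 7, 11}      # visited frames are never re-drawn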
460
+ ### --- Verification Methods --- ###
461
+
462
+ def verify_and_remove_target(
463
+ self,
464
+ frame_sec: int,
465
+ detected_objects: List[str],
466
+ confidence_threshold: float,
467
+ ) -> bool:
468
+ """
469
+ Verify target object detection in an individual frame and remove it from the target list if confirmed.
470
+
471
+ Args:
472
+ frame_sec (int): The timestamp of the frame in seconds.
473
+ detected_objects (List[str]): Objects detected in the grid image for this frame.
474
+ confidence_threshold (float): Threshold to confirm target detection.
475
+
476
+ Returns:
477
+ bool: True if a target was found and removed, False otherwise.
478
+ """
479
+ for target in list(self.remaining_targets):
480
+ if target in detected_objects:
481
+ frame_idx = int(frame_sec * self.raw_fps / self.fps)
482
+ # Read the individual frame
483
+ _, frame = self.read_frame_batch(self.video_path, [frame_idx])
484
+ frame = frame[0] # Extract the frame from the list
485
+
486
+ # Perform detection on the individual frame
487
+ single_confidence_maps, single_detected_objects_maps = self.score_image_grids(
488
+ [frame], (1, 1)
489
+ )
490
+ single_confidence = single_confidence_maps[0, 0, 0]
491
+ single_detected_objects = single_detected_objects_maps[0][0]
492
+ self.score_distribution[frame_sec] = single_confidence
493
+
494
+ # Check if target object confidence exceeds the threshold
495
+ if target in single_detected_objects and single_confidence > confidence_threshold:
496
+ self.remaining_targets.remove(target)
497
+ print(f"Found target '{target}' in frame {frame_idx}, score {single_confidence:.2f}")
498
+ if hasattr(self, "image_grid_iters"):  # visualization history buffers may only exist when search_with_visualization() is running
+ self.image_grid_iters.append([frame])
499
+ self.detect_annotot_iters.append(self.owl.bbox_visualization(images=[frame], detections_inbatch=self.owl.detections_inbatch))
500
+ self.detect_bbox_iters.append(self.owl.detections_inbatch)
501
+
502
+ return True
503
+
504
+ return False
505
+
506
+ ### --- Visualization Methods --- ###
507
+
508
+ def plot_score_distribution(self, save_path: Optional[str] = None):
509
+ """
510
+ Plot the score distribution over time.
511
+
512
+ Args:
513
+ save_path (Optional[str]): File path to save the plot.
514
+ """
515
+ time_axis = np.linspace(0, self.duration, len(self.score_distribution))
516
+
517
+ plt.figure(figsize=(12, 6))
518
+ plt.plot(time_axis, self.score_distribution, label="Score Distribution")
519
+ plt.xlabel("Time (seconds)")
520
+ plt.ylabel("Score")
521
+ plt.title("Score Distribution Over Time")
522
+ plt.grid(True)
523
+ plt.legend()
524
+
525
+ if save_path:
526
+ plt.savefig(save_path, format='png', dpi=300)
527
+ print(f"Plot saved to {save_path}")
528
+
529
+ plt.show()
530
+
531
+ ### --- Main Search Logic --- ###
532
+
533
+ def search(self) -> Tuple[List[np.ndarray], List[float]]:
534
+ """
535
+ Perform the keyframe search based on object detection and dynamic sampling.
536
+
537
+ Returns:
538
+ Tuple[List[np.ndarray], List[float]]: Extracted keyframes and their timestamps.
539
+ """
540
+ K = self.search_nframes # Number of keyframes to find
541
+ # Estimate the total number of iterations based on search_budget and frames per iteration
542
+ video_length = int(self.total_frame_num)
543
+
544
+ # Initialize tqdm progress bar
545
+ progress_bar = tqdm(total=video_length, desc="Searching Iterations / video_length", unit="iter", dynamic_ncols=True)
546
+
547
+ while self.remaining_targets and self.search_budget > 0:
548
+ grid_rows, grid_cols = self.image_grid_shape
549
+ num_frames_in_grid = grid_rows * grid_cols
550
+
551
+ # Sample frames based on the current distribution
552
+ sampled_frame_secs, frames = self.sample_frames(num_frames_in_grid)
553
+ self.search_budget -= num_frames_in_grid
554
+
555
+ # Create an image grid from the sampled frames
556
+ grid_image = self.create_image_grid(frames, grid_rows, grid_cols)
557
+
558
+ # Perform object detection on the image grid
559
+ confidence_maps, detected_objects_maps = self.score_image_grids(
560
+ images=[grid_image],
561
+ image_grids=self.image_grid_shape
562
+ )
563
+
564
+ # Update frame distributions based on detection results
565
+ frame_confidences, frame_detected_objects = self.update_frame_distribution(
566
+ sampled_frame_indices=sampled_frame_secs,
567
+ confidence_maps=confidence_maps,
568
+ detected_objects_maps=detected_objects_maps
569
+ )
570
+
571
+ # Verify and remove detected targets
572
+ for frame_sec, detected_objects in zip(sampled_frame_secs, frame_detected_objects):
573
+ self.verify_and_remove_target(
574
+ frame_sec=frame_sec,
575
+ detected_objects=detected_objects,
576
+ confidence_threshold=self.confidence_threshold,
577
+ )
578
+ # Update the progress bar
579
+ progress_bar.update(1)
580
+
581
+ # Close the progress bar once the loop is done
582
+ progress_bar.close()
583
+ # Select top K frames based on the score distribution
584
+ top_k_indices = np.argsort(self.score_distribution)[-K:][::-1]
585
+ top_k_frames = []
586
+ time_stamps = []
587
+
588
+ # Read and store the top K frames
589
+ for idx in top_k_indices:
590
+ frame_idx = int(idx * self.raw_fps / self.fps)
591
+ _, frame = self.read_frame_batch(self.video_path, [frame_idx])
592
+ top_k_frames.append(frame[0])
593
+ time_stamps.append(idx / self.fps)
594
+
595
+ return top_k_frames, time_stamps
596
+
597
+
598
+
599
+ def search_with_visualization(self) -> Tuple[List[np.ndarray], List[float]]:
600
+ """
601
+ Perform the keyframe search based on object detection and dynamic sampling.
602
+
603
+ Returns:
604
+ Tuple[List[np.ndarray], List[float]]: Extracted keyframes and their timestamps.
605
+ """
606
+
607
+
608
+ # Initialize history
609
+ self.image_grid_iters = [] # iters, b, image
610
+ self.detect_annotot_iters = [] # iters, b, image
611
+ self.detect_bbox_iters = [] #iters, b, n_objects, xxyy
612
+
613
+ K = self.search_nframes # Number of keyframes to find
614
+ # Estimate the total number of iterations based on search_budget and frames per iteration
615
+ video_length = int(self.total_frame_num)
616
+
617
+ # Initialize tqdm progress bar
618
+ progress_bar = tqdm(total=video_length, desc="Searching Iterations / video_length", unit="iter", dynamic_ncols=True)
619
+
620
+ while self.remaining_targets and self.search_budget > 0:
621
+ grid_rows, grid_cols = self.image_grid_shape
622
+ num_frames_in_grid = grid_rows * grid_cols
623
+
624
+ # Sample frames based on the current distribution
625
+ sampled_frame_secs, frames = self.sample_frames(num_frames_in_grid)
626
+ self.search_budget -= num_frames_in_grid
627
+
628
+ # Create an image grid from the sampled frames
629
+ grid_image = self.create_image_grid(frames, grid_rows, grid_cols)
630
+
631
+
632
+ # Perform object detection on the image grid
633
+ confidence_maps, detected_objects_maps = self.score_image_grids(
634
+ images=[grid_image],
635
+ image_grids=self.image_grid_shape
636
+ )
637
+
638
+ self.image_grid_iters.append([grid_image])
639
+ self.detect_annotot_iters.append(self.owl.bbox_visualization(images=[grid_image], detections_inbatch=self.owl.detections_inbatch))
640
+ self.detect_bbox_iters.append(self.owl.detections_inbatch)
641
+
642
+ # Update frame distributions based on detection results
643
+ frame_confidences, frame_detected_objects = self.update_frame_distribution(
644
+ sampled_frame_indices=sampled_frame_secs,
645
+ confidence_maps=confidence_maps,
646
+ detected_objects_maps=detected_objects_maps
647
+ )
648
+
649
+ # Verify and remove detected targets
650
+ for frame_sec, detected_objects in zip(sampled_frame_secs, frame_detected_objects):
651
+ self.verify_and_remove_target(
652
+ frame_sec=frame_sec,
653
+ detected_objects=detected_objects,
654
+ confidence_threshold=self.confidence_threshold,
655
+ )
656
+ # Update the progress bar
657
+ progress_bar.update(1)
658
+
659
+ # Close the progress bar once the loop is done
660
+ progress_bar.close()
661
+ # Select top K frames based on the score distribution
662
+ top_k_indices = np.argsort(self.score_distribution)[-K:][::-1]
663
+ top_k_frames = []
664
+ time_stamps = []
665
+
666
+ # Read and store the top K frames
667
+ for idx in top_k_indices:
668
+ frame_idx = int(idx * self.raw_fps / self.fps)
669
+ _, frame = self.read_frame_batch(self.video_path, [frame_idx])
670
+ top_k_frames.append(frame[0])
671
+ time_stamps.append(idx / self.fps)
672
+
673
+ return top_k_frames, time_stamps
674
+
675
+
676
+
677
+
678
+
679
+ # Example usage
680
+ if __name__ == "__main__":
681
+ # Define video path and target objects
682
+ video_path = "/home/anabella/projects/MLLM/TSTAR/data/friend_clip_t.mp4"
683
+ query = "what is the color of the couch?"
684
+ target_objects = ["couch"] # Target objects to find
685
+ cue_objects = ["table", "woman"]
686
+
687
+ # Create VideoSearcher instance
688
+ searcher = TStarSearcher(
689
+ video_path=video_path,
690
+ target_objects=target_objects,
691
+ cue_objects=cue_objects,
692
+ search_nframes=8,
693
+ image_grid_shape=(4, 4),
694
+ confidence_threshold=0.6
695
+ )
696
+
697
+ # Perform the search
698
+ all_frames, time_stamps = searcher.search()
699
+
700
+ # Process results
701
+ print(f"Found {len(all_frames)} frames, timestamps: {time_stamps}")
702
+
703
+ # Plot the score distribution
704
+ searcher.plot_score_distribution(save_path='./output/score/score_distribution.png')
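A hedged follow-up to the example usage above, showing how the returned keyframes could be written to disk. It assumes, as decord's VideoReader returns, that `all_frames` are RGB arrays; the output directory is a placeholder.

import os
import cv2

os.makedirs("./output/keyframes", exist_ok=True)
for ts, frame in zip(time_stamps, all_frames):
    out_path = f"./output/keyframes/frame_{ts:.1f}s.jpg"
    # Convert RGB (decord) to BGR (OpenCV) before writing
    cv2.imwrite(out_path, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))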
TStar/interface_yolo.py ADDED
@@ -0,0 +1,279 @@
1
+
2
+ import os
3
+ import cv2
4
+ import os.path as osp
5
+ from mmengine.config import Config
6
+ from mmengine.dataset import Compose
7
+ from mmdet.apis import init_detector
8
+ from mmdet.utils import get_test_pipeline_cfg
9
+ # from mmengine.runner.amp import autocast
10
+ from torch.amp import autocast
11
+ import torch
12
+ import supervision as sv
13
+ from typing import Dict, Optional, Sequence, List
14
+
15
+ import supervision as sv
16
+ class LabelAnnotator(sv.LabelAnnotator):
17
+
18
+ @staticmethod
19
+ def resolve_text_background_xyxy(
20
+ center_coordinates,
21
+ text_wh,
22
+ position,
23
+ ):
24
+ center_x, center_y = center_coordinates
25
+ text_w, text_h = text_wh
26
+ return center_x, center_y, center_x + text_w, center_y + text_h
27
+
28
+
29
+ class YoloInterface:
30
+ def __init__(self):
31
+ """
32
+ Initialize the YOLO-World model with the given configuration and checkpoint.
33
+
34
+ Args:
35
+ """
36
+
37
+
38
+ pass
39
+ def set_BBoxAnnotator(self):
40
+ self.BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1)
41
+ # MASK_ANNOTATOR = sv.MaskAnnotator()
42
+ self.LABEL_ANNOTATOR = LabelAnnotator(text_padding=4,
43
+ text_scale=0.5,
44
+ text_thickness=1)
45
+
46
+ class YoloWorldInterface(YoloInterface):
47
+ def __init__(self, config_path: str, checkpoint_path: str, device: str = "cuda:0"):
48
+ """
49
+ Initialize the YOLO-World model with the given configuration and checkpoint.
50
+
51
+ Args:
52
+ config_path (str): Path to the model configuration file.
53
+ checkpoint_path (str): Path to the model checkpoint.
54
+ device (str): Device to run the model on (e.g., 'cuda:0', 'cpu').
55
+ """
56
+ self.config_path = config_path
57
+ self.checkpoint_path = checkpoint_path
58
+ self.device = device
59
+
60
+ # Load configuration
61
+ cfg = Config.fromfile(config_path)
62
+ cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(config_path))[0])
63
+ cfg.load_from = checkpoint_path
64
+
65
+ # Initialize the model
66
+ self.model = init_detector(cfg, checkpoint=checkpoint_path, device=device)
67
+ self.set_BBoxAnnotator()
68
+
69
+ # Initialize the test pipeline
70
+ # build test pipeline
71
+ self.model.cfg.test_dataloader.dataset.pipeline[
72
+ 0].type = 'mmdet.LoadImageFromNDArray'
73
+ self.test_pipeline = Compose(self.model.cfg.test_dataloader.dataset.pipeline)
74
+
75
+
76
+
77
+ def reparameterize_object_list(self, target_objects: List[str], cue_objects: List[str]):
78
+ """
79
+ Reparameterize the detect object list to be used by the YOLO model.
80
+
81
+ Args:
82
+ target_objects (List[str]): List of target object names.
83
+ cue_objects (List[str]): List of cue object names.
84
+ """
85
+ # Combine target objects and cue objects into the final text format
86
+ combined_texts = target_objects + cue_objects
87
+
88
+ # Format the text prompts for the YOLO model
89
+ self.texts = [[obj.strip()] for obj in combined_texts] + [[' ']]
90
+
91
+ # Reparameterize the YOLO model with the provided text prompts
92
+ self.model.reparameterize(self.texts)
93
+
94
+
95
+ def inference(self, image: str, max_dets: int = 100, score_threshold: float = 0.3, use_amp: bool = False):
96
+ """
97
+ Run inference on a single image.
98
+
99
+ Args:
100
+ image (str): Path to the image.
101
+ max_dets (int): Maximum number of detections to keep.
102
+ score_threshold (float): Score threshold for filtering detections.
103
+ use_amp (bool): Whether to use mixed precision for inference.
104
+
105
+ Returns:
106
+ sv.Detections: Detection results.
107
+ """
108
+ # Prepare data for inference
109
+ data_info = dict(img_id=0, img_path=image, texts=self.texts)
110
+ data_info = self.test_pipeline(data_info)
111
+ data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
112
+ data_samples=[data_info['data_samples']])
113
+
114
+ # Run inference
115
+ with autocast(device_type=self.device.split(':')[0], enabled=use_amp), torch.no_grad():  # torch.amp.autocast requires a device_type
116
+ output = self.model.test_step(data_batch)[0]
117
+ pred_instances = output.pred_instances
118
+ pred_instances = pred_instances[pred_instances.scores.float() > score_threshold]
119
+
120
+ if len(pred_instances.scores) > max_dets:
121
+ indices = pred_instances.scores.float().topk(max_dets)[1]
122
+ pred_instances = pred_instances[indices]
123
+
124
+ pred_instances = pred_instances.cpu().numpy()
125
+
126
+ # Process detections
127
+ detections = sv.Detections(
128
+ xyxy=pred_instances['bboxes'],
129
+ class_id=pred_instances['labels'],
130
+ confidence=pred_instances['scores'],
131
+ mask=pred_instances.get('masks', None)
132
+ )
133
+ return detections
134
+
135
+ def inference_detector(self, images, max_dets=50, score_threshold=0.2, use_amp: bool = False):
136
+ data_info = dict(img_id=0, img=images[0], texts=self.texts) #TBD for batch searching
137
+ data_info = self.test_pipeline(data_info)
138
+ data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
139
+ data_samples=[data_info['data_samples']])
140
+ detections_inbatch = []
141
+ with torch.no_grad():
142
+ outputs = self.model.test_step(data_batch)
143
+ # cover to searcher interface format
144
+
145
+ for output in outputs:
146
+ pred_instances = output.pred_instances
147
+ pred_instances = pred_instances[pred_instances.scores.float() >
148
+ score_threshold]
149
+ if len(pred_instances.scores) > max_dets:
150
+ indices = pred_instances.scores.float().topk(max_dets)[1]
151
+ pred_instances = pred_instances[indices]
152
+
153
+ output.pred_instances = pred_instances
154
+
155
+ if 'masks' in pred_instances:
156
+ masks = pred_instances['masks']
157
+ else:
158
+ masks = None
159
+ pred_instances = pred_instances.cpu().numpy()
160
+ detections = sv.Detections(xyxy=pred_instances['bboxes'],
161
+ class_id=pred_instances['labels'],
162
+ confidence=pred_instances['scores'],
163
+ mask=masks)
164
+ detections_inbatch.append(detections)
165
+ self.detect_outputs_raw = outputs
166
+ self.detections_inbatch = detections_inbatch
167
+ return detections_inbatch
168
+
169
+ def bbox_visualization(self, images, detections_inbatch):
170
+ anno_images = []
171
+ # detections_inbatch = self.detections_inbatch
172
+ for b, detections in enumerate(detections_inbatch):
173
+ texts = self.texts
174
+ labels = [
175
+ f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
176
+ zip(detections.class_id, detections.confidence)
177
+ ]
178
+
179
+
180
+ index = b  # use the image that corresponds to this batch entry
181
+ image = images[index]
182
+ anno_image = image.copy()
183
+
184
+
185
+ anno_image = self.BOUNDING_BOX_ANNOTATOR.annotate(anno_image, detections)
186
+ anno_image = self.LABEL_ANNOTATOR.annotate(anno_image, detections, labels=labels)
187
+ anno_images.append(anno_image)
188
+
189
+ return anno_images
190
+
191
+
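A hedged usage sketch of YoloWorldInterface as defined above. The config path, checkpoint path, and image path are placeholders, not files shipped with this commit.

import cv2

detector = YoloWorldInterface(
    config_path="YOLO-World/configs/pretrain/your_config.py",   # placeholder
    checkpoint_path="pretrained/yolo_world_checkpoint.pth",     # placeholder
    device="cuda:0",
)
detector.reparameterize_object_list(target_objects=["couch"], cue_objects=["table", "woman"])
frame = cv2.cvtColor(cv2.imread("example_frame.jpg"), cv2.COLOR_BGR2RGB)  # placeholder image
detections = detector.inference_detector([frame], max_dets=50, score_threshold=0.2)
annotated = detector.bbox_visualization([frame], detections)
cv2.imwrite("annotated.jpg", cv2.cvtColor(annotated[0], cv2.COLOR_RGB2BGR))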
192
+
193
+ import torch
194
+ from typing import List
195
+ import supervision as sv  # ensure the Supervision library is installed
196
+ import os.path as osp
197
+
198
+ class YoloV5Interface(YoloInterface):
199
+ def __init__(self, config_path="ultralytics/yolov5", checkpoint_path: str = 'yolov5s', device: str = 'cuda:0'):
200
+ """
201
+ Initialize the YOLOv5 model via torch.hub.
202
+
203
+ Args:
204
+ checkpoint_path (str): YOLOv5 model variant to load from config_path (e.g., 'yolov5s', 'yolov5m', 'yolov5l', 'yolov5x').
205
+ device (str): Device to run the model on (e.g., 'cuda:0', 'cpu').
206
+ """
207
+ self.device = device
208
+ self.model = torch.hub.load(config_path, checkpoint_path, pretrained=True)  # honor the constructor arguments
209
+
210
+ self.model.to(self.device)
211
+ self.model.eval()
212
+ self.target_classes = None  # stores the list of target class names
213
+
214
+ self.texts = None
215
+ self.test_pipeline = None
216
+
217
+ def reparameterize_object_list(self, target_objects: List[str], cue_objects: List[str]):
218
+ """
219
+ Re-parameterize the detection object list for use at inference time.
220
+
221
+ Args:
222
+ target_objects (List[str]): List of target object names.
223
+ cue_objects (List[str]): List of cue object names.
224
+ """
225
+ # Combine target objects and cue objects
226
+ combined_objects = target_objects + cue_objects
227
+ self.target_classes = combined_objects
228
+
229
+ def inference(self, images: str, max_dets: int = 100, score_threshold: float = 0.3, use_amp: bool = False):
230
+ """
231
+ Run inference on the given image(s).
232
+
233
+ Args:
234
+ images: Image path(s) or array(s) accepted by the YOLOv5 hub model.
235
+ max_dets (int): Maximum number of detections to keep.
236
+ score_threshold (float): Score threshold for filtering detections.
237
+ use_amp (bool): Whether to use mixed precision for inference.
238
+
239
+ Returns:
240
+ List[sv.Detections]: Detection results per image.
241
+ """
242
+ results = self.model(images, size=640)  # input size can be adjusted as needed
243
+
244
+ # Extract detection results (assuming batch size is 1)
245
+ detections_batch = results.pred # B tensors of shape (N, 6) [x1, y1, x2, y2, confidence, class]
246
+
247
+ # Apply the score threshold
248
+ # Holds the filtered detections for each image in the batch
249
+ filtered_detections = []
250
+
251
+ for detections in detections_batch:
252
+ # Apply the score threshold, dropping detections with confidence <= score_threshold
253
+ detections = detections[detections[:, 4] > score_threshold]
254
+ # If more than max_dets remain, keep only the top max_dets detections
255
+ if len(detections) > max_dets:
256
+ detections = detections[:max_dets]
257
+ # If target classes are set, filter the detections
258
+ if self.target_classes is not None:
259
+ # Get all class names
260
+ class_names = self.model.names
261
+ # Get the class IDs of the target classes
262
+ target_class_ids = [i for i, name in class_names.items() if name in self.target_classes]
263
+
264
+
265
+ # Filter the detections
266
+ detections = detections[[cls in target_class_ids for cls in detections[:, 5]]]
267
+ # Convert to a Supervision Detections object
268
+ detections = sv.Detections(
269
+ xyxy=detections[:, :4].cpu().numpy(),
270
+ confidence=detections[:, 4].cpu().numpy(),
271
+ class_id=detections[:, 5].cpu().numpy().astype(int)
272
+ )
273
+
274
+
275
+ filtered_detections.append(detections)
276
+
277
+ return filtered_detections
278
+
279
+
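A hedged usage sketch of YoloV5Interface as defined above. The image path is a placeholder; class names are assumed to be COCO names recognized by the YOLOv5 hub model.

import cv2

yolo = YoloV5Interface(device="cuda:0")
yolo.reparameterize_object_list(target_objects=["couch"], cue_objects=["person"])
image = cv2.cvtColor(cv2.imread("example_frame.jpg"), cv2.COLOR_BGR2RGB)  # placeholder image
detections_per_image = yolo.inference([image], max_dets=50, score_threshold=0.3)
for det in detections_per_image:
    print(det.xyxy, det.confidence, det.class_id)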
TStar/utils.py ADDED
@@ -0,0 +1,105 @@
1
+
2
+
3
+ try:
4
+ from llava.constants import (IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN,
5
+ DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
6
+ except ImportError:
7
+ print("Warning: LLaVA constants not found; LLaVA-specific token constants are unavailable.")
8
+ from typing import List
9
+ import math
10
+ from typing import List, Dict
11
+ from PIL import Image
12
+ import base64
13
+ import io
14
+ import numpy as np
15
+ from contextlib import contextmanager
16
+
17
+ try:
18
+ import cv2
19
+ except ImportError:
20
+ cv2 = None
21
+ print("Warning: OpenCV is not installed, video frame extraction will not work.")
22
+
23
+
24
+
25
+ def encode_image_to_base64(image) -> str:
26
+ """
27
+ Convert an image (PIL.Image or numpy.ndarray) to a Base64 encoded string.
28
+ """
29
+ try:
30
+ # If the input is a numpy array, convert it to a PIL Image
31
+ if isinstance(image, np.ndarray):
32
+ image = Image.fromarray(image)
33
+
34
+ # Ensure it's a PIL Image before proceeding
35
+ if not isinstance(image, Image.Image):
36
+ raise ValueError("Input must be a PIL.Image or numpy.ndarray")
37
+
38
+ # Encode the image to Base64
39
+ buffered = io.BytesIO()
40
+ image.save(buffered, format="JPEG")
41
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
42
+ except Exception as e:
43
+ raise ValueError(f"Error encoding image: {str(e)}")
44
+
45
+ def load_video_frames(video_path: str, num_frames: int = 8) -> List[Image.Image]:
46
+ """
47
+ Read num_frames frames from the video and return them as a list of PIL.Image objects.
48
+ """
49
+ if cv2 is None:
50
+ raise ImportError("OpenCV is not installed, cannot load video frames.")
51
+
52
+ frames = []
53
+ cap = cv2.VideoCapture(video_path)
54
+ if not cap.isOpened():
55
+ raise ValueError(f"Cannot open video: {video_path}")
56
+
57
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
58
+ if total_frames == 0:
59
+ cap.release()
60
+ raise ValueError("Video has zero frames or could not retrieve frame count.")
61
+
62
+ num_frames = min(num_frames, total_frames)
63
+ step = total_frames / num_frames
64
+
65
+ for i in range(num_frames):
66
+ frame_index = int(math.floor(i * step))
67
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
68
+ ret, frame = cap.read()
69
+ if not ret:
70
+ break
71
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
72
+ frames.append(Image.fromarray(frame_rgb))
73
+
74
+ cap.release()
75
+ return frames
76
+
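A hedged sketch combining the two helpers above: sample a few frames from a video and Base64-encode them, e.g., to attach to an image-capable LLM request. The video path is a placeholder.

frames = load_video_frames("example_video.mp4", num_frames=4)
payload = [encode_image_to_base64(frame) for frame in frames]
print(len(payload), payload[0][:32])  # number of encoded frames and a prefix of the first string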
77
+
78
+ def save_as_gif(images, output_gif_path):
79
+ from PIL import Image
80
+ import os
81
+
82
+ fps = 1  # frame rate of 1 frame per second
83
+ duration = int(1000 / fps)  # display time per GIF frame, in milliseconds
84
+
85
+ # Convert each frame to a PIL image
86
+ pil_images = [Image.fromarray(img.astype('uint8')) for img in images]
87
+
88
+ # Save as a GIF
89
+ pil_images[0].save(
90
+ output_gif_path,
91
+ save_all=True,
92
+ append_images=pil_images[1:],
93
+ duration=duration,
94
+ loop=0  # loop playback (0 means loop forever)
95
+ )
96
+ print(f"Saved GIF: {output_gif_path}")
97
+
98
+ @contextmanager
99
+ def NoWarning():
100
+ import logging
101
+ logging.disable(logging.WARNING)  # suppress WARNING and below for the duration of the context
102
+ try:
103
+ yield
104
+ finally:
105
+ logging.disable(logging.NOTSET)  # restore normal logging
YOLO-World/.dockerignore ADDED
@@ -0,0 +1,2 @@
1
+ docs
2
+ Dockerfile
YOLO-World/.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ # Basic .gitattributes for a python repo.
2
+
3
+ # Source files
4
+ # ============
5
+ *.pxd text diff=python
6
+ *.py text diff=python
7
+ *.py3 text diff=python
8
+ *.pyw text diff=python
9
+ *.pyx text diff=python
10
+ *.pyz text diff=python
11
+ *.pyi text diff=python
12
+
13
+ # Binary files
14
+ # ============
15
+ *.db binary
16
+ *.p binary
17
+ *.pkl binary
18
+ *.pickle binary
19
+ *.pyc binary export-ignore
20
+ *.pyo binary export-ignore
21
+ *.pyd binary
22
+
23
+ # Jupyter notebook
24
+ *.ipynb text eol=lf
25
+
26
+ # Others
27
+ * text=auto
28
+ *.txt text
29
+ *.sh text eol=lf
30
+
31
+ # Note: .db, .p, and .pkl files are associated
32
+ # with the python modules ``pickle``, ``dbm.*``,
33
+ # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
34
+ # (among others).
YOLO-World/.gitignore ADDED
@@ -0,0 +1,127 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Translations
51
+ *.mo
52
+ *.pot
53
+
54
+ # Django stuff:
55
+ *.log
56
+ local_settings.py
57
+ db.sqlite3
58
+
59
+ # Flask stuff:
60
+ instance/
61
+ .webassets-cache
62
+
63
+ # Scrapy stuff:
64
+ .scrapy
65
+
66
+ # Sphinx documentation
67
+ docs/en/_build/
68
+ docs/zh_cn/_build/
69
+
70
+ # PyBuilder
71
+ target/
72
+
73
+ # Jupyter Notebook
74
+ .ipynb_checkpoints
75
+
76
+ # pyenv
77
+ .python-version
78
+
79
+ # celery beat schedule file
80
+ celerybeat-schedule
81
+
82
+ # SageMath parsed files
83
+ *.sage.py
84
+
85
+ # Environments
86
+ .env
87
+ .venv
88
+ env/
89
+ venv/
90
+ ENV/
91
+ env.bak/
92
+ venv.bak/
93
+
94
+ # Spyder project settings
95
+ .spyderproject
96
+ .spyproject
97
+
98
+ # Rope project settings
99
+ .ropeproject
100
+
101
+ # mkdocs documentation
102
+ /site
103
+
104
+ # mypy
105
+ .mypy_cache/
106
+ # data/
107
+ # data
108
+ .vscode
109
+ .idea
110
+ .DS_Store
111
+
112
+ # custom
113
+ *.pkl
114
+ *.pkl.json
115
+ *.log.json
116
+ docs/modelzoo_statistics.md
117
+ mmdet/.mim
118
+ work_dirs
119
+
120
+ # Pytorch
121
+ *.pth
122
+ *.py~
123
+ *.sh~
124
+
125
+ # venus
126
+ venus_run.sh
127
+
YOLO-World/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "third_party/mmyolo"]
2
+ path = third_party/mmyolo
3
+ url = https://github.com/onuralpszr/mmyolo.git
YOLO-World/Dockerfile ADDED
@@ -0,0 +1,43 @@
1
+ FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
2
+
3
+ ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py"
4
+ ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth"
5
+
6
+ ENV FORCE_CUDA="1"
7
+ ENV MMCV_WITH_OPS=1
8
+
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ python3-pip \
11
+ libgl1-mesa-glx \
12
+ libsm6 \
13
+ libxext6 \
14
+ libxrender-dev \
15
+ libglib2.0-0 \
16
+ git \
17
+ python3-dev \
18
+ python3-wheel
19
+
20
+ RUN pip3 install --upgrade pip \
21
+ && pip3 install \
22
+ gradio \
23
+ opencv-python \
24
+ supervision \
25
+ mmengine \
26
+ setuptools \
27
+ openmim \
28
+ && mim install mmcv==2.0.0 \
29
+ && pip3 install --no-cache-dir --index-url https://download.pytorch.org/whl/cu118 \
30
+ wheel \
31
+ torch \
32
+ torchvision \
33
+ torchaudio
34
+
35
+ COPY . /yolo
36
+ WORKDIR /yolo
37
+
38
+ RUN pip3 install -e .
39
+
40
+ RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT
41
+
42
+ ENTRYPOINT [ "python3", "demo.py" ]
43
+ CMD ["configs/pretrain/$MODEL", "weights/$WEIGHT"]
YOLO-World/LICENSE ADDED
@@ -0,0 +1,1347 @@
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU General Public License is a free, copyleft license for
11
+ software and other kinds of works.
12
+
13
+ The licenses for most software and other practical works are designed
14
+ to take away your freedom to share and change the works. By contrast,
15
+ the GNU General Public License is intended to guarantee your freedom to
16
+ share and change all versions of a program--to make sure it remains free
17
+ software for all its users. We, the Free Software Foundation, use the
18
+ GNU General Public License for most of our software; it applies also to
19
+ any other work released this way by its authors. You can apply it to
20
+ your programs, too.
21
+
22
+ When we speak of free software, we are referring to freedom, not
23
+ price. Our General Public Licenses are designed to make sure that you
24
+ have the freedom to distribute copies of free software (and charge for
25
+ them if you wish), that you receive source code or can get it if you
26
+ want it, that you can change the software or use pieces of it in new
27
+ free programs, and that you know you can do these things.
28
+
29
+ To protect your rights, we need to prevent others from denying you
30
+ these rights or asking you to surrender the rights. Therefore, you have
31
+ certain responsibilities if you distribute copies of the software, or if
32
+ you modify it: responsibilities to respect the freedom of others.
33
+
34
+ For example, if you distribute copies of such a program, whether
35
+ gratis or for a fee, you must pass on to the recipients the same
36
+ freedoms that you received. You must make sure that they, too, receive
37
+ or can get the source code. And you must show them these terms so they
38
+ know their rights.
39
+
40
+ Developers that use the GNU GPL protect your rights with two steps:
41
+ (1) assert copyright on the software, and (2) offer you this License
42
+ giving you legal permission to copy, distribute and/or modify it.
43
+
44
+ For the developers' and authors' protection, the GPL clearly explains
45
+ that there is no warranty for this free software. For both users' and
46
+ authors' sake, the GPL requires that modified versions be marked as
47
+ changed, so that their problems will not be attributed erroneously to
48
+ authors of previous versions.
49
+
50
+ Some devices are designed to deny users access to install or run
51
+ modified versions of the software inside them, although the manufacturer
52
+ can do so. This is fundamentally incompatible with the aim of
53
+ protecting users' freedom to change the software. The systematic
54
+ pattern of such abuse occurs in the area of products for individuals to
55
+ use, which is precisely where it is most unacceptable. Therefore, we
56
+ have designed this version of the GPL to prohibit the practice for those
57
+ products. If such problems arise substantially in other domains, we
58
+ stand ready to extend this provision to those domains in future versions
59
+ of the GPL, as needed to protect the freedom of users.
60
+
61
+ Finally, every program is threatened constantly by software patents.
62
+ States should not allow patents to restrict development and use of
63
+ software on general-purpose computers, but in those that do, we wish to
64
+ avoid the special danger that patents applied to a free program could
65
+ make it effectively proprietary. To prevent this, the GPL assures that
66
+ patents cannot be used to render the program non-free.
67
+
68
+ The precise terms and conditions for copying, distribution and
69
+ modification follow.
70
+
71
+ TERMS AND CONDITIONS
72
+
73
+ 0. Definitions.
74
+
75
+ "This License" refers to version 3 of the GNU General Public License.
76
+
77
+ "Copyright" also means copyright-like laws that apply to other kinds of
78
+ works, such as semiconductor masks.
79
+
80
+ "The Program" refers to any copyrightable work licensed under this
81
+ License. Each licensee is addressed as "you". "Licensees" and
82
+ "recipients" may be individuals or organizations.
83
+
84
+ To "modify" a work means to copy from or adapt all or part of the work
85
+ in a fashion requiring copyright permission, other than the making of an
86
+ exact copy. The resulting work is called a "modified version" of the
87
+ earlier work or a work "based on" the earlier work.
88
+
89
+ A "covered work" means either the unmodified Program or a work based
90
+ on the Program.
91
+
92
+ To "propagate" a work means to do anything with it that, without
93
+ permission, would make you directly or secondarily liable for
94
+ infringement under applicable copyright law, except executing it on a
95
+ computer or modifying a private copy. Propagation includes copying,
96
+ distribution (with or without modification), making available to the
97
+ public, and in some countries other activities as well.
98
+
99
+ To "convey" a work means any kind of propagation that enables other
100
+ parties to make or receive copies. Mere interaction with a user through
101
+ a computer network, with no transfer of a copy, is not conveying.
102
+
103
+ An interactive user interface displays "Appropriate Legal Notices"
104
+ to the extent that it includes a convenient and prominently visible
105
+ feature that (1) displays an appropriate copyright notice, and (2)
106
+ tells the user that there is no warranty for the work (except to the
107
+ extent that warranties are provided), that licensees may convey the
108
+ work under this License, and how to view a copy of this License. If
109
+ the interface presents a list of user commands or options, such as a
110
+ menu, a prominent item in the list meets this criterion.
111
+
112
+ 1. Source Code.
113
+
114
+ The "source code" for a work means the preferred form of the work
115
+ for making modifications to it. "Object code" means any non-source
116
+ form of a work.
117
+
118
+ A "Standard Interface" means an interface that either is an official
119
+ standard defined by a recognized standards body, or, in the case of
120
+ interfaces specified for a particular programming language, one that
121
+ is widely used among developers working in that language.
122
+
123
+ The "System Libraries" of an executable work include anything, other
124
+ than the work as a whole, that (a) is included in the normal form of
125
+ packaging a Major Component, but which is not part of that Major
126
+ Component, and (b) serves only to enable use of the work with that
127
+ Major Component, or to implement a Standard Interface for which an
128
+ implementation is available to the public in source code form. A
129
+ "Major Component", in this context, means a major essential component
130
+ (kernel, window system, and so on) of the specific operating system
131
+ (if any) on which the executable work runs, or a compiler used to
132
+ produce the work, or an object code interpreter used to run it.
133
+
134
+ The "Corresponding Source" for a work in object code form means all
135
+ the source code needed to generate, install, and (for an executable
136
+ work) run the object code and to modify the work, including scripts to
137
+ control those activities. However, it does not include the work's
138
+ System Libraries, or general-purpose tools or generally available free
139
+ programs which are used unmodified in performing those activities but
140
+ which are not part of the work. For example, Corresponding Source
141
+ includes interface definition files associated with source files for
142
+ the work, and the source code for shared libraries and dynamically
143
+ linked subprograms that the work is specifically designed to require,
144
+ such as by intimate data communication or control flow between those
145
+ subprograms and other parts of the work.
146
+
147
+ The Corresponding Source need not include anything that users
148
+ can regenerate automatically from other parts of the Corresponding
149
+ Source.
150
+
151
+ The Corresponding Source for a work in source code form is that
152
+ same work.
153
+
154
+ 2. Basic Permissions.
155
+
156
+ All rights granted under this License are granted for the term of
157
+ copyright on the Program, and are irrevocable provided the stated
158
+ conditions are met. This License explicitly affirms your unlimited
159
+ permission to run the unmodified Program. The output from running a
160
+ covered work is covered by this License only if the output, given its
161
+ content, constitutes a covered work. This License acknowledges your
162
+ rights of fair use or other equivalent, as provided by copyright law.
163
+
164
+ You may make, run and propagate covered works that you do not
165
+ convey, without conditions so long as your license otherwise remains
166
+ in force. You may convey covered works to others for the sole purpose
167
+ of having them make modifications exclusively for you, or provide you
168
+ with facilities for running those works, provided that you comply with
169
+ the terms of this License in conveying all material for which you do
170
+ not control copyright. Those thus making or running the covered works
171
+ for you must do so exclusively on your behalf, under your direction
172
+ and control, on terms that prohibit them from making any copies of
173
+ your copyrighted material outside their relationship with you.
174
+
175
+ Conveying under any other circumstances is permitted solely under
176
+ the conditions stated below. Sublicensing is not allowed; section 10
177
+ makes it unnecessary.
178
+
179
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180
+
181
+ No covered work shall be deemed part of an effective technological
182
+ measure under any applicable law fulfilling obligations under article
183
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184
+ similar laws prohibiting or restricting circumvention of such
185
+ measures.
186
+
187
+ When you convey a covered work, you waive any legal power to forbid
188
+ circumvention of technological measures to the extent such circumvention
189
+ is effected by exercising rights under this License with respect to
190
+ the covered work, and you disclaim any intention to limit operation or
191
+ modification of the work as a means of enforcing, against the work's
192
+ users, your or third parties' legal rights to forbid circumvention of
193
+ technological measures.
194
+
195
+ 4. Conveying Verbatim Copies.
196
+
197
+ You may convey verbatim copies of the Program's source code as you
198
+ receive it, in any medium, provided that you conspicuously and
199
+ appropriately publish on each copy an appropriate copyright notice;
200
+ keep intact all notices stating that this License and any
201
+ non-permissive terms added in accord with section 7 apply to the code;
202
+ keep intact all notices of the absence of any warranty; and give all
203
+ recipients a copy of this License along with the Program.
204
+
205
+ You may charge any price or no price for each copy that you convey,
206
+ and you may offer support or warranty protection for a fee.
207
+
208
+ 5. Conveying Modified Source Versions.
209
+
210
+ You may convey a work based on the Program, or the modifications to
211
+ produce it from the Program, in the form of source code under the
212
+ terms of section 4, provided that you also meet all of these conditions:
213
+
214
+ a) The work must carry prominent notices stating that you modified
215
+ it, and giving a relevant date.
216
+
217
+ b) The work must carry prominent notices stating that it is
218
+ released under this License and any conditions added under section
219
+ 7. This requirement modifies the requirement in section 4 to
220
+ "keep intact all notices".
221
+
222
+ c) You must license the entire work, as a whole, under this
223
+ License to anyone who comes into possession of a copy. This
224
+ License will therefore apply, along with any applicable section 7
225
+ additional terms, to the whole of the work, and all its parts,
226
+ regardless of how they are packaged. This License gives no
227
+ permission to license the work in any other way, but it does not
228
+ invalidate such permission if you have separately received it.
229
+
230
+ d) If the work has interactive user interfaces, each must display
231
+ Appropriate Legal Notices; however, if the Program has interactive
232
+ interfaces that do not display Appropriate Legal Notices, your
233
+ work need not make them do so.
234
+
235
+ A compilation of a covered work with other separate and independent
236
+ works, which are not by their nature extensions of the covered work,
237
+ and which are not combined with it such as to form a larger program,
238
+ in or on a volume of a storage or distribution medium, is called an
239
+ "aggregate" if the compilation and its resulting copyright are not
240
+ used to limit the access or legal rights of the compilation's users
241
+ beyond what the individual works permit. Inclusion of a covered work
242
+ in an aggregate does not cause this License to apply to the other
243
+ parts of the aggregate.
244
+
245
+ 6. Conveying Non-Source Forms.
246
+
247
+ You may convey a covered work in object code form under the terms
248
+ of sections 4 and 5, provided that you also convey the
249
+ machine-readable Corresponding Source under the terms of this License,
250
+ in one of these ways:
251
+
252
+ a) Convey the object code in, or embodied in, a physical product
253
+ (including a physical distribution medium), accompanied by the
254
+ Corresponding Source fixed on a durable physical medium
255
+ customarily used for software interchange.
256
+
257
+ b) Convey the object code in, or embodied in, a physical product
258
+ (including a physical distribution medium), accompanied by a
259
+ written offer, valid for at least three years and valid for as
260
+ long as you offer spare parts or customer support for that product
261
+ model, to give anyone who possesses the object code either (1) a
262
+ copy of the Corresponding Source for all the software in the
263
+ product that is covered by this License, on a durable physical
264
+ medium customarily used for software interchange, for a price no
265
+ more than your reasonable cost of physically performing this
266
+ conveying of source, or (2) access to copy the
267
+ Corresponding Source from a network server at no charge.
268
+
269
+ c) Convey individual copies of the object code with a copy of the
270
+ written offer to provide the Corresponding Source. This
271
+ alternative is allowed only occasionally and noncommercially, and
272
+ only if you received the object code with such an offer, in accord
273
+ with subsection 6b.
274
+
275
+ d) Convey the object code by offering access from a designated
276
+ place (gratis or for a charge), and offer equivalent access to the
277
+ Corresponding Source in the same way through the same place at no
278
+ further charge. You need not require recipients to copy the
279
+ Corresponding Source along with the object code. If the place to
280
+ copy the object code is a network server, the Corresponding Source
281
+ may be on a different server (operated by you or a third party)
282
+ that supports equivalent copying facilities, provided you maintain
283
+ clear directions next to the object code saying where to find the
284
+ Corresponding Source. Regardless of what server hosts the
285
+ Corresponding Source, you remain obligated to ensure that it is
286
+ available for as long as needed to satisfy these requirements.
287
+
288
+ e) Convey the object code using peer-to-peer transmission, provided
289
+ you inform other peers where the object code and Corresponding
290
+ Source of the work are being offered to the general public at no
291
+ charge under subsection 6d.
292
+
293
+ A separable portion of the object code, whose source code is excluded
294
+ from the Corresponding Source as a System Library, need not be
295
+ included in conveying the object code work.
296
+
297
+ A "User Product" is either (1) a "consumer product", which means any
298
+ tangible personal property which is normally used for personal, family,
299
+ or household purposes, or (2) anything designed or sold for incorporation
300
+ into a dwelling. In determining whether a product is a consumer product,
301
+ doubtful cases shall be resolved in favor of coverage. For a particular
302
+ product received by a particular user, "normally used" refers to a
303
+ typical or common use of that class of product, regardless of the status
304
+ of the particular user or of the way in which the particular user
305
+ actually uses, or expects or is expected to use, the product. A product
306
+ is a consumer product regardless of whether the product has substantial
307
+ commercial, industrial or non-consumer uses, unless such uses represent
308
+ the only significant mode of use of the product.
309
+
310
+ "Installation Information" for a User Product means any methods,
311
+ procedures, authorization keys, or other information required to install
312
+ and execute modified versions of a covered work in that User Product from
313
+ a modified version of its Corresponding Source. The information must
314
+ suffice to ensure that the continued functioning of the modified object
315
+ code is in no case prevented or interfered with solely because
316
+ modification has been made.
317
+
318
+ If you convey an object code work under this section in, or with, or
319
+ specifically for use in, a User Product, and the conveying occurs as
320
+ part of a transaction in which the right of possession and use of the
321
+ User Product is transferred to the recipient in perpetuity or for a
322
+ fixed term (regardless of how the transaction is characterized), the
323
+ Corresponding Source conveyed under this section must be accompanied
324
+ by the Installation Information. But this requirement does not apply
325
+ if neither you nor any third party retains the ability to install
326
+ modified object code on the User Product (for example, the work has
327
+ been installed in ROM).
328
+
329
+ The requirement to provide Installation Information does not include a
330
+ requirement to continue to provide support service, warranty, or updates
331
+ for a work that has been modified or installed by the recipient, or for
332
+ the User Product in which it has been modified or installed. Access to a
333
+ network may be denied when the modification itself materially and
334
+ adversely affects the operation of the network or violates the rules and
335
+ protocols for communication across the network.
336
+
337
+ Corresponding Source conveyed, and Installation Information provided,
338
+ in accord with this section must be in a format that is publicly
339
+ documented (and with an implementation available to the public in
340
+ source code form), and must require no special password or key for
341
+ unpacking, reading or copying.
342
+
343
+ 7. Additional Terms.
344
+
345
+ "Additional permissions" are terms that supplement the terms of this
346
+ License by making exceptions from one or more of its conditions.
347
+ Additional permissions that are applicable to the entire Program shall
348
+ be treated as though they were included in this License, to the extent
349
+ that they are valid under applicable law. If additional permissions
350
+ apply only to part of the Program, that part may be used separately
351
+ under those permissions, but the entire Program remains governed by
352
+ this License without regard to the additional permissions.
353
+
354
+ When you convey a copy of a covered work, you may at your option
355
+ remove any additional permissions from that copy, or from any part of
356
+ it. (Additional permissions may be written to require their own
357
+ removal in certain cases when you modify the work.) You may place
358
+ additional permissions on material, added by you to a covered work,
359
+ for which you have or can give appropriate copyright permission.
360
+
361
+ Notwithstanding any other provision of this License, for material you
362
+ add to a covered work, you may (if authorized by the copyright holders of
363
+ that material) supplement the terms of this License with terms:
364
+
365
+ a) Disclaiming warranty or limiting liability differently from the
366
+ terms of sections 15 and 16 of this License; or
367
+
368
+ b) Requiring preservation of specified reasonable legal notices or
369
+ author attributions in that material or in the Appropriate Legal
370
+ Notices displayed by works containing it; or
371
+
372
+ c) Prohibiting misrepresentation of the origin of that material, or
373
+ requiring that modified versions of such material be marked in
374
+ reasonable ways as different from the original version; or
375
+
376
+ d) Limiting the use for publicity purposes of names of licensors or
377
+ authors of the material; or
378
+
379
+ e) Declining to grant rights under trademark law for use of some
380
+ trade names, trademarks, or service marks; or
381
+
382
+ f) Requiring indemnification of licensors and authors of that
383
+ material by anyone who conveys the material (or modified versions of
384
+ it) with contractual assumptions of liability to the recipient, for
385
+ any liability that these contractual assumptions directly impose on
386
+ those licensors and authors.
387
+
388
+ All other non-permissive additional terms are considered "further
389
+ restrictions" within the meaning of section 10. If the Program as you
390
+ received it, or any part of it, contains a notice stating that it is
391
+ governed by this License along with a term that is a further
392
+ restriction, you may remove that term. If a license document contains
393
+ a further restriction but permits relicensing or conveying under this
394
+ License, you may add to a covered work material governed by the terms
395
+ of that license document, provided that the further restriction does
396
+ not survive such relicensing or conveying.
397
+
398
+ If you add terms to a covered work in accord with this section, you
399
+ must place, in the relevant source files, a statement of the
400
+ additional terms that apply to those files, or a notice indicating
401
+ where to find the applicable terms.
402
+
403
+ Additional terms, permissive or non-permissive, may be stated in the
404
+ form of a separately written license, or stated as exceptions;
405
+ the above requirements apply either way.
406
+
407
+ 8. Termination.
408
+
409
+ You may not propagate or modify a covered work except as expressly
410
+ provided under this License. Any attempt otherwise to propagate or
411
+ modify it is void, and will automatically terminate your rights under
412
+ this License (including any patent licenses granted under the third
413
+ paragraph of section 11).
414
+
415
+ However, if you cease all violation of this License, then your
416
+ license from a particular copyright holder is reinstated (a)
417
+ provisionally, unless and until the copyright holder explicitly and
418
+ finally terminates your license, and (b) permanently, if the copyright
419
+ holder fails to notify you of the violation by some reasonable means
420
+ prior to 60 days after the cessation.
421
+
422
+ Moreover, your license from a particular copyright holder is
423
+ reinstated permanently if the copyright holder notifies you of the
424
+ violation by some reasonable means, this is the first time you have
425
+ received notice of violation of this License (for any work) from that
426
+ copyright holder, and you cure the violation prior to 30 days after
427
+ your receipt of the notice.
428
+
429
+ Termination of your rights under this section does not terminate the
430
+ licenses of parties who have received copies or rights from you under
431
+ this License. If your rights have been terminated and not permanently
432
+ reinstated, you do not qualify to receive new licenses for the same
433
+ material under section 10.
434
+
435
+ 9. Acceptance Not Required for Having Copies.
436
+
437
+ You are not required to accept this License in order to receive or
438
+ run a copy of the Program. Ancillary propagation of a covered work
439
+ occurring solely as a consequence of using peer-to-peer transmission
440
+ to receive a copy likewise does not require acceptance. However,
441
+ nothing other than this License grants you permission to propagate or
442
+ modify any covered work. These actions infringe copyright if you do
443
+ not accept this License. Therefore, by modifying or propagating a
444
+ covered work, you indicate your acceptance of this License to do so.
445
+
446
+ 10. Automatic Licensing of Downstream Recipients.
447
+
448
+ Each time you convey a covered work, the recipient automatically
449
+ receives a license from the original licensors, to run, modify and
450
+ propagate that work, subject to this License. You are not responsible
451
+ for enforcing compliance by third parties with this License.
452
+
453
+ An "entity transaction" is a transaction transferring control of an
454
+ organization, or substantially all assets of one, or subdividing an
455
+ organization, or merging organizations. If propagation of a covered
456
+ work results from an entity transaction, each party to that
457
+ transaction who receives a copy of the work also receives whatever
458
+ licenses to the work the party's predecessor in interest had or could
459
+ give under the previous paragraph, plus a right to possession of the
460
+ Corresponding Source of the work from the predecessor in interest, if
461
+ the predecessor has it or can get it with reasonable efforts.
462
+
463
+ You may not impose any further restrictions on the exercise of the
464
+ rights granted or affirmed under this License. For example, you may
465
+ not impose a license fee, royalty, or other charge for exercise of
466
+ rights granted under this License, and you may not initiate litigation
467
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
468
+ any patent claim is infringed by making, using, selling, offering for
469
+ sale, or importing the Program or any portion of it.
470
+
471
+ 11. Patents.
472
+
473
+ A "contributor" is a copyright holder who authorizes use under this
474
+ License of the Program or a work on which the Program is based. The
475
+ work thus licensed is called the contributor's "contributor version".
476
+
477
+ A contributor's "essential patent claims" are all patent claims
478
+ owned or controlled by the contributor, whether already acquired or
479
+ hereafter acquired, that would be infringed by some manner, permitted
480
+ by this License, of making, using, or selling its contributor version,
481
+ but do not include claims that would be infringed only as a
482
+ consequence of further modification of the contributor version. For
483
+ purposes of this definition, "control" includes the right to grant
484
+ patent sublicenses in a manner consistent with the requirements of
485
+ this License.
486
+
487
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
488
+ patent license under the contributor's essential patent claims, to
489
+ make, use, sell, offer for sale, import and otherwise run, modify and
490
+ propagate the contents of its contributor version.
491
+
492
+ In the following three paragraphs, a "patent license" is any express
493
+ agreement or commitment, however denominated, not to enforce a patent
494
+ (such as an express permission to practice a patent or covenant not to
495
+ sue for patent infringement). To "grant" such a patent license to a
496
+ party means to make such an agreement or commitment not to enforce a
497
+ patent against the party.
498
+
499
+ If you convey a covered work, knowingly relying on a patent license,
500
+ and the Corresponding Source of the work is not available for anyone
501
+ to copy, free of charge and under the terms of this License, through a
502
+ publicly available network server or other readily accessible means,
503
+ then you must either (1) cause the Corresponding Source to be so
504
+ available, or (2) arrange to deprive yourself of the benefit of the
505
+ patent license for this particular work, or (3) arrange, in a manner
506
+ consistent with the requirements of this License, to extend the patent
507
+ license to downstream recipients. "Knowingly relying" means you have
508
+ actual knowledge that, but for the patent license, your conveying the
509
+ covered work in a country, or your recipient's use of the covered work
510
+ in a country, would infringe one or more identifiable patents in that
511
+ country that you have reason to believe are valid.
512
+
513
+ If, pursuant to or in connection with a single transaction or
514
+ arrangement, you convey, or propagate by procuring conveyance of, a
515
+ covered work, and grant a patent license to some of the parties
516
+ receiving the covered work authorizing them to use, propagate, modify
517
+ or convey a specific copy of the covered work, then the patent license
518
+ you grant is automatically extended to all recipients of the covered
519
+ work and works based on it.
520
+
521
+ A patent license is "discriminatory" if it does not include within
522
+ the scope of its coverage, prohibits the exercise of, or is
523
+ conditioned on the non-exercise of one or more of the rights that are
524
+ specifically granted under this License. You may not convey a covered
525
+ work if you are a party to an arrangement with a third party that is
526
+ in the business of distributing software, under which you make payment
527
+ to the third party based on the extent of your activity of conveying
528
+ the work, and under which the third party grants, to any of the
529
+ parties who would receive the covered work from you, a discriminatory
530
+ patent license (a) in connection with copies of the covered work
531
+ conveyed by you (or copies made from those copies), or (b) primarily
532
+ for and in connection with specific products or compilations that
533
+ contain the covered work, unless you entered into that arrangement,
534
+ or that patent license was granted, prior to 28 March 2007.
535
+
536
+ Nothing in this License shall be construed as excluding or limiting
537
+ any implied license or other defenses to infringement that may
538
+ otherwise be available to you under applicable patent law.
539
+
540
+ 12. No Surrender of Others' Freedom.
541
+
542
+ If conditions are imposed on you (whether by court order, agreement or
543
+ otherwise) that contradict the conditions of this License, they do not
544
+ excuse you from the conditions of this License. If you cannot convey a
545
+ covered work so as to satisfy simultaneously your obligations under this
546
+ License and any other pertinent obligations, then as a consequence you may
547
+ not convey it at all. For example, if you agree to terms that obligate you
548
+ to collect a royalty for further conveying from those to whom you convey
549
+ the Program, the only way you could satisfy both those terms and this
550
+ License would be to refrain entirely from conveying the Program.
551
+
552
+ 13. Use with the GNU Affero General Public License.
553
+
554
+ Notwithstanding any other provision of this License, you have
555
+ permission to link or combine any covered work with a work licensed
556
+ under version 3 of the GNU Affero General Public License into a single
557
+ combined work, and to convey the resulting work. The terms of this
558
+ License will continue to apply to the part which is the covered work,
559
+ but the special requirements of the GNU Affero General Public License,
560
+ section 13, concerning interaction through a network will apply to the
561
+ combination as such.
562
+
563
+ 14. Revised Versions of this License.
564
+
565
+ The Free Software Foundation may publish revised and/or new versions of
566
+ the GNU General Public License from time to time. Such new versions will
567
+ be similar in spirit to the present version, but may differ in detail to
568
+ address new problems or concerns.
569
+
570
+ Each version is given a distinguishing version number. If the
571
+ Program specifies that a certain numbered version of the GNU General
572
+ Public License "or any later version" applies to it, you have the
573
+ option of following the terms and conditions either of that numbered
574
+ version or of any later version published by the Free Software
575
+ Foundation. If the Program does not specify a version number of the
576
+ GNU General Public License, you may choose any version ever published
577
+ by the Free Software Foundation.
578
+
579
+ If the Program specifies that a proxy can decide which future
580
+ versions of the GNU General Public License can be used, that proxy's
581
+ public statement of acceptance of a version permanently authorizes you
582
+ to choose that version for the Program.
583
+
584
+ Later license versions may give you additional or different
585
+ permissions. However, no additional obligations are imposed on any
586
+ author or copyright holder as a result of your choosing to follow a
587
+ later version.
588
+
589
+ 15. Disclaimer of Warranty.
590
+
591
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599
+
600
+ 16. Limitation of Liability.
601
+
602
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610
+ SUCH DAMAGES.
611
+
612
+ 17. Interpretation of Sections 15 and 16.
613
+
614
+ If the disclaimer of warranty and limitation of liability provided
615
+ above cannot be given local legal effect according to their terms,
616
+ reviewing courts shall apply local law that most closely approximates
617
+ an absolute waiver of all civil liability in connection with the
618
+ Program, unless a warranty or assumption of liability accompanies a
619
+ copy of the Program in return for a fee.
620
+
621
+ END OF TERMS AND CONDITIONS
622
+
623
+ How to Apply These Terms to Your New Programs
624
+
625
+ If you develop a new program, and you want it to be of the greatest
626
+ possible use to the public, the best way to achieve this is to make it
627
+ free software which everyone can redistribute and change under these terms.
628
+
629
+ To do so, attach the following notices to the program. It is safest
630
+ to attach them to the start of each source file to most effectively
631
+ state the exclusion of warranty; and each file should have at least
632
+ the "copyright" line and a pointer to where the full notice is found.
633
+
634
+ <one line to give the program's name and a brief idea of what it does.>
635
+ Copyright (C) <year> <name of author>
636
+
637
+ This program is free software: you can redistribute it and/or modify
638
+ it under the terms of the GNU General Public License as published by
639
+ the Free Software Foundation, either version 3 of the License, or
640
+ (at your option) any later version.
641
+
642
+ This program is distributed in the hope that it will be useful,
643
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
644
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645
+ GNU General Public License for more details.
646
+
647
+ You should have received a copy of the GNU General Public License
648
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
649
+
650
+ Also add information on how to contact you by electronic and paper mail.
651
+
652
+ If the program does terminal interaction, make it output a short
653
+ notice like this when it starts in an interactive mode:
654
+
655
+ <program> Copyright (C) <year> <name of author>
656
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657
+ This is free software, and you are welcome to redistribute it
658
+ under certain conditions; type `show c' for details.
659
+
660
+ The hypothetical commands `show w' and `show c' should show the appropriate
661
+ parts of the General Public License. Of course, your program's commands
662
+ might be different; for a GUI interface, you would use an "about box".
663
+
664
+ You should also get your employer (if you work as a programmer) or school,
665
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
666
+ For more information on this, and how to apply and follow the GNU GPL, see
667
+ <https://www.gnu.org/licenses/>.
668
+
669
+ The GNU General Public License does not permit incorporating your program
670
+ into proprietary programs. If your program is a subroutine library, you
671
+ may consider it more useful to permit linking proprietary applications with
672
+ the library. If this is what you want to do, use the GNU Lesser General
673
+ Public License instead of this License. But first, please read
674
+ <https://www.gnu.org/licenses/why-not-lgpl.html>.
YOLO-World/README.md ADDED
@@ -0,0 +1,238 @@
1
+ <div align="center">
2
+ <img src="./assets/yolo_logo.png" width=60%>
3
+ <br>
4
+ <a href="https://scholar.google.com/citations?hl=zh-CN&user=PH8rJHYAAAAJ">Tianheng Cheng</a><sup><span>2,3,*</span></sup>,
5
+ <a href="https://linsong.info/">Lin Song</a><sup><span>1,📧,*</span></sup>,
6
+ <a href="https://yxgeee.github.io/">Yixiao Ge</a><sup><span>1,🌟,2</span></sup>,
7
+ <a href="http://eic.hust.edu.cn/professor/liuwenyu/"> Wenyu Liu</a><sup><span>3</span></sup>,
8
+ <a href="https://xwcv.github.io/">Xinggang Wang</a><sup><span>3,📧</span></sup>,
9
+ <a href="https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=en">Ying Shan</a><sup><span>1,2</span></sup>
10
+ </br>
11
+
12
+ \* Equal contribution 🌟 Project lead 📧 Corresponding author
13
+
14
+ <sup>1</sup> Tencent AI Lab, <sup>2</sup> ARC Lab, Tencent PCG
15
+ <sup>3</sup> Huazhong University of Science and Technology
16
+ <br>
17
+ <div>
18
+
19
+ [![project page](https://img.shields.io/badge/Project-Page-green)](https://wondervictor.github.io/)
20
+ [![arxiv paper](https://img.shields.io/badge/arXiv-Paper-red)](https://arxiv.org/abs/2401.17270)
21
+ <a href="https://colab.research.google.com/github/AILab-CVC/YOLO-World/blob/master/inference.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
22
+ [![demo](https://img.shields.io/badge/🤗HuggingFace-Spaces-orange)](https://huggingface.co/spaces/stevengrove/YOLO-World)
23
+ [![Replicate](https://replicate.com/zsxkib/yolo-world/badge)](https://replicate.com/zsxkib/yolo-world)
24
+ [![hfpaper](https://img.shields.io/badge/🤗HuggingFace-Paper-yellow)](https://huggingface.co/papers/2401.17270)
25
+ [![license](https://img.shields.io/badge/License-GPLv3.0-blue)](LICENSE)
26
+ [![yoloworldseg](https://img.shields.io/badge/YOLOWorldxEfficientSAM-🤗Spaces-orange)](https://huggingface.co/spaces/SkalskiP/YOLO-World)
27
+ [![yologuide](https://img.shields.io/badge/📖Notebook-roboflow-purple)](https://supervision.roboflow.com/develop/notebooks/zero-shot-object-detection-with-yolo-world)
28
+ [![deploy](https://media.roboflow.com/deploy.svg)](https://inference.roboflow.com/foundation/yolo_world/)
29
+
30
+ </div>
31
+ </div>
32
+
33
+ ## Notice
34
+
35
+ **YOLO-World is still under active development!**
36
+
37
+ We recommend that everyone **use English to communicate on issues**, as this helps developers from around the world discuss, share experiences, and answer questions together.
38
+
39
+ For business licensing and other related inquiries, don't hesitate to contact `[email protected]`.
40
+
41
+ ## 🔥 Updates
42
+ `[2024-11-5]`: We have updated `YOLO-World-Image`, and you can try it on HuggingFace: [YOLO-World-Image (Preview Version)](https://huggingface.co/spaces/wondervictor/YOLO-World-Image). It's a *preview* version and we are still improving it! Detailed documents about training and few-shot inference are coming soon.\
43
+ `[2024-7-8]`: YOLO-World has now been integrated into [ComfyUI](https://github.com/StevenGrove/ComfyUI-YOLOWorld)! Come and try adding YOLO-World to your workflow; you can access it at [StevenGrove/ComfyUI-YOLOWorld](https://github.com/StevenGrove/ComfyUI-YOLOWorld)!
44
+ `[2024-5-18]:` YOLO-World models have been [integrated with the FiftyOne computer vision toolkit](https://docs.voxel51.com/integrations/ultralytics.html#open-vocabulary-detection) for streamlined open-vocabulary inference across image and video datasets.
45
+ `[2024-5-16]:` Hey guys! Long time no see! This update contains (1) [fine-tuning guide](https://github.com/AILab-CVC/YOLO-World?#highlights--introduction) and (2) [TFLite Export](./docs/tflite_deploy.md) with INT8 Quantization.
46
+ `[2024-5-9]:` This update contains the real [`reparameterization`](./docs/reparameterize.md) 🪄, and it's better for fine-tuning on custom datasets and improves the training/inference efficiency 🚀!
47
+ `[2024-4-28]:` Long time no see! This update contains bug fixes and improvements: (1) ONNX demo; (2) image demo (supports tensor input); (3) new pre-trained models; (4) image prompts; (5) a simple version for fine-tuning / deployment; (6) an installation guide (including a `requirements.txt`).
48
+ `[2024-3-28]:` We provide: (1) more high-resolution pre-trained models (e.g., S, M, X) ([#142](https://github.com/AILab-CVC/YOLO-World/issues/142)); (2) pre-trained models with CLIP-Large text encoders. Most importantly, we preliminarily fix the **fine-tuning without `mask-refine`** and explore a new fine-tuning setting ([#160](https://github.com/AILab-CVC/YOLO-World/issues/160),[#76](https://github.com/AILab-CVC/YOLO-World/issues/76)). In addition, fine-tuning YOLO-World with `mask-refine` also obtains significant improvements, check more details in [configs/finetune_coco](./configs/finetune_coco/).
49
+ `[2024-3-16]:` We fix the bugs about the demo ([#110](https://github.com/AILab-CVC/YOLO-World/issues/110),[#94](https://github.com/AILab-CVC/YOLO-World/issues/94),[#129](https://github.com/AILab-CVC/YOLO-World/issues/129), [#125](https://github.com/AILab-CVC/YOLO-World/issues/125)) with visualizations of segmentation masks, and release [**YOLO-World with Embeddings**](./docs/prompt_yolo_world.md), which supports prompt tuning, text prompts and image prompts.
50
+ `[2024-3-3]:` We add the **high-resolution YOLO-World**, which supports `1280x1280` resolution with higher accuracy and better performance for small objects!
51
+ `[2024-2-29]:` We release the newest version of [**YOLO-World-v2**](./docs/updates.md) with higher accuracy and faster speed! We hope the community can join us to improve YOLO-World!
52
+ `[2024-2-28]:` Excited to announce that YOLO-World has been accepted by **CVPR 2024**! We're continuing to make YOLO-World faster and stronger, as well as making it better to use for all.
53
+ `[2024-2-22]:` We sincerely thank [RoboFlow](https://roboflow.com/) and [@Skalskip92](https://twitter.com/skalskip92) for the [**Video Guide**](https://www.youtube.com/watch?v=X7gKBGVz4vs) about YOLO-World, nice work!
54
+ `[2024-2-18]:` We thank [@Skalskip92](https://twitter.com/skalskip92) for developing the wonderful segmentation demo by connecting YOLO-World and EfficientSAM. You can try it now at [🤗 HuggingFace Spaces](https://huggingface.co/spaces/SkalskiP/YOLO-World).
55
+ `[2024-2-17]:` The largest model **X** of YOLO-World is released, which achieves better zero-shot performance!
56
+ `[2024-2-17]:` We release the code & models for **YOLO-World-Seg** now! YOLO-World now supports open-vocabulary / zero-shot object segmentation!
57
+ `[2024-2-15]:` The pre-trained YOLO-World-L with CC3M-Lite is released!
58
+ `[2024-2-14]:` We provide the [`image_demo`](demo.py) for inference on images or directories.
59
+ `[2024-2-10]:` We provide the [fine-tuning](./docs/finetuning.md) and [data](./docs/data.md) details for fine-tuning YOLO-World on the COCO dataset or the custom datasets!
60
+ `[2024-2-3]:` The repo now includes a `Gradio` demo, so you can build the YOLO-World demo on your own device!
61
+ `[2024-2-1]:` We've released the code and weights of YOLO-World now!
62
+ `[2024-2-1]:` We deploy the YOLO-World demo on [HuggingFace 🤗](https://huggingface.co/spaces/stevengrove/YOLO-World), you can try it now!
63
+ `[2024-1-31]:` We are excited to launch **YOLO-World**, a cutting-edge real-time open-vocabulary object detector.
64
+
65
+
66
+ ## TODO
67
+
68
+ YOLO-World is under active development, so please stay tuned ☕️!
69
+ If you have suggestions📃 or ideas💡, **we would love for you to bring them up in the [Roadmap](https://github.com/AILab-CVC/YOLO-World/issues/109)** ❤️!
70
+ > YOLO-World is under active development 📃; if you have suggestions or ideas 💡, **we would really like you to raise them in the [Roadmap](https://github.com/AILab-CVC/YOLO-World/issues/109)** ❤️!
71
+
72
+ ## [FAQ (Frequently Asked Questions)](https://github.com/AILab-CVC/YOLO-World/discussions/149)
73
+
74
+ We have set up an FAQ about YOLO-World in the GitHub Discussions. Please raise the problems you run into (or the solutions you find) while using YOLO-World there, and we hope it helps everyone find answers quickly.
75
+
76
+ > We maintain an FAQ about YOLO-World in the GitHub Discussions that collects common questions; everyone is welcome to post problems encountered during use or share solutions there, and we hope you can quickly find answers from it.
77
+
78
+
79
+ ## Highlights & Introduction
80
+
81
+ This repo contains the PyTorch implementation, pre-trained weights, and pre-training/fine-tuning code for YOLO-World.
82
+
83
+ * YOLO-World is pre-trained on large-scale datasets, including detection, grounding, and image-text datasets.
84
+
85
+ * YOLO-World is the next-generation YOLO detector, with a strong open-vocabulary detection capability and grounding ability.
86
+
87
+ * YOLO-World presents a *prompt-then-detect* paradigm for efficient user-vocabulary inference, which re-parameterizes vocabulary embeddings as model parameters and achieves superior inference speed. You can export your own detection model without extra training or fine-tuning in our [online demo](https://huggingface.co/spaces/stevengrove/YOLO-World)!
88
+
89
+
90
+ <div align="center">
91
+ <img width=800px src="./assets/yolo_arch.png">
92
+ </div>
93
+ ## Model Zoo
94
+
95
+ We've pre-trained YOLO-World-S/M/L from scratch and evaluated them on `LVIS val-1.0` and `LVIS minival`. We provide the pre-trained model weights and training logs for applications/research and for reproducing the results.
96
+
97
+ ### Zero-shot Inference on LVIS dataset
98
+
99
+ <div><font size=2>
100
+
101
+ | model | Pre-train Data | Size | AP<sup>mini</sup> | AP<sub>r</sub> | AP<sub>c</sub> | AP<sub>f</sub> | AP<sup>val</sup> | AP<sub>r</sub> | AP<sub>c</sub> | AP<sub>f</sub> | weights |
102
+ | :------------------------------------------------------------------------------------------------------------------- | :------------------- | :----------------- | :--------------: | :------------: | :------------: | :------------: | :-------------: | :------------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
103
+ | [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 22.7 | 16.3 | 20.8 | 25.5 | 17.3 | 11.3 | 14.9 | 22.7 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth)|
104
+ | [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280&#x1F538; | 24.1 | 18.7 | 22.0 | 26.9 | 18.8 | 14.1 | 16.3 | 23.8 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth)|
105
+ | [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 30.0 | 25.0 | 27.2 | 33.4 | 23.5 | 17.1 | 20.0 | 30.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth)|
106
+ | [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280&#x1F538; | 31.6 | 24.5 | 29.0 | 35.1 | 25.3 | 19.3 | 22.0 | 31.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth)|
107
+ | [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 33.0 | 22.6 | 32.0 | 35.8 | 26.0 | 18.6 | 23.0 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth)|
108
+ | 🔥 [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CCLiteV2 | 640 | 33.4 | 23.1 | 31.9 | 36.6 | 26.6 | 20.3 | 23.2 | 33.2 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_v2_l_obj365v1_goldg_cc3mv2_pretrain-2f3a4a22.pth)|
109
+ | [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280&#x1F538; | 34.6 | 29.2 | 32.8 | 37.2 | 27.6 | 21.9 | 24.2 | 34.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth)|
110
+ | [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 34.0 | 22.0 | 32.6 | 37.4 | 27.1 | 19.9 | 23.9 | 33.9 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_clip_large_o365v1_goldg_pretrain-8ff2e744.pth)|
111
+ | [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py) 🔥 | O365+GoldG | 800&#x1F538; | 35.5 | 28.3 | 33.2 | 38.8 | 28.6 | 22.0 | 25.1 | 35.4 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth)|
112
+ | [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 32.9 | 25.3 | 31.1 | 35.8 | 26.1 | 20.6 | 22.6 | 32.3 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth)|
113
+ | [YOLO-Worldv2-X](./configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 35.4 | 28.7 | 32.9 | 38.7 | 28.4 | 20.6 | 25.6 | 35.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) |
114
+ | 🔥 [YOLO-Worldv2-X]() | O365+GoldG+CC3M-Lite | 1280&#x1F538; | 37.4 | 30.5 | 35.2 | 40.7 | 29.8 | 21.1 | 26.8 | 37.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth) |
115
+ | [YOLO-Worldv2-XL](./configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 36.0 | 25.8 | 34.1 | 39.5 | 29.1 | 21.1 | 26.3 | 35.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_xl_obj365v1_goldg_cc3mlite_pretrain-5daf1395.pth) |
116
+
117
+ </font>
118
+ </div>
119
+
120
+ **NOTE:**
121
+ 1. AP<sup>mini</sup>: evaluated on LVIS `minival`.
122
+ 2. AP<sup>val</sup>: evaluated on LVIS `val 1.0`.
123
+ 3. [HuggingFace Mirror](https://hf-mirror.com/) provides a mirror of HuggingFace and is an option for users who cannot reach huggingface.co directly.
124
+ 4. &#x1F538;: models fine-tuned with the pre-training data at a higher input resolution.
125
+
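+ To reproduce the zero-shot numbers above, download a checkpoint from the table and run the evaluation script described in the Training & Evaluation section below. A minimal sketch for YOLO-Worldv2-L (the filename comes from the table above; replace `blob` with `resolve` in the table links when downloading with `wget`):
+
+ ```bash
+ mkdir -p pretrained_models
+ wget -P pretrained_models \
+   https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth
+ # zero-shot evaluation on LVIS minival with 8 GPUs
+ ./tools/dist_test.sh configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py \
+   pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth 8
+ ```
+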
126
+ **Pre-training Logs:**
127
+
128
+ We provide the pre-training logs of `YOLO-World-v2`. Due to unexpected errors on the local machines, the training may have been interrupted several times (hence the multi-part logs).
129
+
130
+ | Model | YOLO-World-v2-S | YOLO-World-v2-M | YOLO-World-v2-L | YOLO-World-v2-X |
131
+ | :--- | :-------------: | :--------------: | :-------------: | :-------------: |
132
+ |Pre-training Log | [Part-1](https://drive.google.com/file/d/1oib7pKfA2h1U_5-85H_s0Nz8jWd0R-WP/view?usp=drive_link), [Part-2](https://drive.google.com/file/d/11cZ6OZy80VTvBlZy3kzLAHCxx5Iix5-n/view?usp=drive_link) | [Part-1](https://drive.google.com/file/d/1E6vYSS8kBipGc8oQnsjAfeUAx8I9yOX7/view?usp=drive_link), [Part-2](https://drive.google.com/file/d/1fbM7vt2tgSeB8o_7tUDofWvpPNSViNj5/view?usp=drive_link) | [Part-1](https://drive.google.com/file/d/1Tola1QGJZTL6nGy3SBxKuknfNfREDm8J/view?usp=drive_link), [Part-2](https://drive.google.com/file/d/1mTBXniioUb0CdctCG4ckIU6idGo0NnH8/view?usp=drive_link) | [Final part](https://drive.google.com/file/d/1aEUA_EPQbXOrpxHTQYB6ieGXudb1PLpd/view?usp=drive_link)|
133
+
134
+
135
+ ## Getting started
136
+
137
+ ### 1. Installation
138
+
139
+ YOLO-World is developed based on `torch==1.11.0`, `mmyolo==0.6.0`, and `mmdetection==3.0.0`. Check more details about the `requirements` and `mmcv` in [docs/installation](./docs/installation.md).
140
+
141
+ #### Clone Project
142
+
143
+ ```bash
144
+ git clone --recursive https://github.com/AILab-CVC/YOLO-World.git
145
+ ```
146
+ #### Install
147
+
148
+ ```bash
149
+ pip install torch wheel -q
150
+ pip install -e .
151
+ ```
152
+
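+ If you want to match the versions listed above, a minimal pinned-install sketch is shown below (the exact `mmcv` build depends on your CUDA/PyTorch setup; see [docs/installation](./docs/installation.md)):
+
+ ```bash
+ # pin the versions YOLO-World was developed against
+ pip install torch==1.11.0 mmyolo==0.6.0 mmdet==3.0.0
+ pip install -e .
+ ```
+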
153
+ ### 2. Preparing Data
154
+
155
+ We provide the details about the pre-training data in [docs/data](./docs/data.md).
156
+
157
+
158
+ ## Training & Evaluation
159
+
160
+ We adopt the default [training](./tools/train.py) and [evaluation](./tools/test.py) scripts of [mmyolo](https://github.com/open-mmlab/mmyolo).
161
+ We provide the configs for pre-training and fine-tuning in `configs/pretrain` and `configs/finetune_coco`.
162
+ Training YOLO-World is easy:
163
+
164
+ ```bash
165
+ chmod +x tools/dist_train.sh
166
+ # sample command for pre-training, use AMP for mixed-precision training
167
+ ./tools/dist_train.sh configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py 8 --amp
168
+ ```
169
+ **NOTE:** YOLO-World is pre-trained on 4 nodes with 8 GPUs per node (32 GPUs in total). For multi-node pre-training, `nnodes` and `node_rank` should be specified, as sketched below.
170
+
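+ A minimal multi-node sketch (assuming the stock mmyolo `dist_train.sh`, which reads `NNODES`, `NODE_RANK`, `MASTER_ADDR`, and `PORT` from the environment; the address and port below are placeholders):
+
+ ```bash
+ # run on the master node (rank 0); repeat on the other three nodes with NODE_RANK=1,2,3
+ NNODES=4 NODE_RANK=0 MASTER_ADDR=10.0.0.1 PORT=29500 \
+   ./tools/dist_train.sh configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py 8 --amp
+ ```
+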
171
+ Evaluating YOLO-World is also easy:
172
+
173
+ ```bash
174
+ chmod +x tools/dist_test.sh
175
+ ./tools/dist_test.sh path/to/config path/to/weights 8
176
+ ```
177
+
178
+ **NOTE:** We mainly evaluate the performance on LVIS-minival for pre-training.
179
+
180
+ ## Fine-tuning YOLO-World
181
+
182
+ <div align="center">
183
+ <img src="./assets/finetune_yoloworld.png" width=800px>
184
+ </div>
185
+
186
+
187
+ <div align="center">
188
+ <b><p>Choose your pre-trained YOLO-World and fine-tune it!</p></b>
189
+ </div>
190
+
191
+
192
+ YOLO-World supports **zero-shot inference**, and three types of **fine-tuning recipes**: **(1) normal fine-tuning**, **(2) prompt tuning**, and **(3) reparameterized fine-tuning**.
193
+
194
+ * Normal Fine-tuning: we provide the details about fine-tuning YOLO-World in [docs/fine-tuning](./docs/finetuning.md); a minimal launch command is sketched after this list.
195
+
196
+ * Prompt Tuning: we provide more details about prompt tuning in [docs/prompt_yolo_world](./docs/prompt_yolo_world.md).
197
+
198
+ * Reparameterized Fine-tuning: the reparameterized YOLO-World is more suitable for specific domains far from generic scenes. You can find more details in [docs/reparameterize](./docs/reparameterize.md).
199
+
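+ As a minimal launch sketch for normal fine-tuning (assuming the pre-trained checkpoint has been placed at the path expected by the config's `load_from`):
+
+ ```bash
+ chmod +x tools/dist_train.sh
+ # fine-tune YOLO-World-v2-L on COCO with mask-refine on 8 GPUs
+ ./tools/dist_train.sh configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py 8 --amp
+ ```
+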
200
+ ## Deployment
201
+
202
+ We provide the details about deployment for downstream applications in [docs/deployment](./docs/deploy.md).
203
+ You can directly download the ONNX model through the online [demo](https://huggingface.co/spaces/stevengrove/YOLO-World) in Huggingface Spaces 🤗.
204
+
205
+ - [x] ONNX export and demo: [docs/deploy](https://github.com/AILab-CVC/YOLO-World/blob/master/docs/deploy.md)
206
+ - [x] TFLite and INT8 Quantization: [docs/tflite_deploy](https://github.com/AILab-CVC/YOLO-World/blob/master/docs/tflite_deploy.md)
207
+ - [ ] TensorRT: coming soon.
208
+ - [ ] C++: coming soon.
209
+
210
+ ## Demo
211
+
212
+ See [`demo`](./demo) for more details.
213
+
214
+ - [x] `gradio_demo.py`: Gradio demo, ONNX export
215
+ - [x] `image_demo.py`: inference with images or a directory of images
216
+ - [x] `simple_demo.py`: a simple demo of YOLO-World that takes an image `array` as input (instead of a path).
217
+ - [x] `video_demo.py`: run YOLO-World inference on videos.
218
+ - [x] `inference.ipynb`: jupyter notebook for YOLO-World.
219
+ - [x] [Google Colab Notebook](https://colab.research.google.com/drive/1F_7S5lSaFM06irBCZqjhbN7MpUXo6WwO?usp=sharing): We sincerely thank [Onuralp](https://github.com/onuralpszr) for sharing the [Colab Demo](https://colab.research.google.com/drive/1F_7S5lSaFM06irBCZqjhbN7MpUXo6WwO?usp=sharing), you can have a try 😊!
220
+
221
+ ## Acknowledgement
222
+
223
+ We sincerely thank [mmyolo](https://github.com/open-mmlab/mmyolo), [mmdetection](https://github.com/open-mmlab/mmdetection), [GLIP](https://github.com/microsoft/GLIP), and [transformers](https://github.com/huggingface/transformers) for providing their wonderful code to the community!
224
+
225
+ ## Citations
226
+ If you find YOLO-World useful in your research or applications, please consider giving us a star 🌟 and citing it.
227
+
228
+ ```bibtex
229
+ @inproceedings{Cheng2024YOLOWorld,
230
+ title={YOLO-World: Real-Time Open-Vocabulary Object Detection},
231
+ author={Cheng, Tianheng and Song, Lin and Ge, Yixiao and Liu, Wenyu and Wang, Xinggang and Shan, Ying},
232
+ booktitle={Proc. IEEE Conf. Computer Vision and Pattern Recognition (CVPR)},
233
+ year={2024}
234
+ }
235
+ ```
236
+
237
+ ## Licence
238
+ YOLO-World is under the GPL-v3 Licence and is supported for commercial usage. If you need a commercial license for YOLO-World, please feel free to contact us.
YOLO-World/assets/finetune_yoloworld.png ADDED

Git LFS Details

  • SHA256: b65e93db06eb5f0bcba8531ab30ac0230908340f6a28f55d619e873adfb9b74a
  • Pointer size: 131 Bytes
  • Size of remote file: 478 kB
YOLO-World/assets/reparameterize.png ADDED
YOLO-World/assets/yolo_arch.png ADDED

Git LFS Details

  • SHA256: 4bd0dc3a8c5190e290463499ddaedb40e19d340cd4e180dfc9533b9182dc3d32
  • Pointer size: 131 Bytes
  • Size of remote file: 305 kB
YOLO-World/assets/yolo_logo.png ADDED

Git LFS Details

  • SHA256: 3a21d633d611336a0434daebe0585f60d3046537895e75ae949a768e54acd215
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
YOLO-World/configs/finetune_coco/README.md ADDED
@@ -0,0 +1,29 @@
1
+ ## Fine-tune YOLO-World on MS-COCO
2
+
3
+
4
+ ### Updates
5
+
6
+ 1. [2024-3-27]: Considering that fine-tuning YOLO-World on COCO **without `mask-refine`** obtains bad results, e.g., YOLO-World-L obtains 48.6 AP without `mask-refine` compared to 53.3 AP with `mask-refine`, we rethink the training process and explore new training schemes for fine-tuning without `mask-refine`.
7
+ BTW, the COCO fine-tuning results are updated with higher performance (with `mask-refine`)!
8
+
9
+
10
+ ### COCO Results and Checkpoints
11
+
12
+ **NOTE:**
13
+ 1. AP<sup>ZS</sup>: AP evaluated in the zero-shot setting (w/o fine-tuning on COCO dataset).
14
+ 2. `mask-refine`: refine the box annotations with masks, and add `CopyPaste` augmentation during training.
15
+
16
+ | model | Schedule | `mask-refine` | efficient neck | AP<sup>ZS</sup>| AP | AP<sub>50</sub> | AP<sub>75</sub> | weights | log |
17
+ | :---- | :-------: | :----------: |:-------------: | :------------: | :-: | :--------------:| :-------------: |:------: | :-: |
18
+ | [YOLO-World-v2-S](./yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 37.5 | 46.1 | 62.0 | 49.9 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-492dc329.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240327_110411.log) |
19
+ | [YOLO-World-v2-M](./yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 42.8 | 51.0 | 67.5 | 55.2 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-69c27ac7.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240327_110411.log) |
20
+ | [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 45.1 | 53.9 | 70.9 | 58.8 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-81c701ee.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240326_160313.log) |
21
+ | [YOLO-World-v2-X](./yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 46.8 | 54.7 | 71.6 | 59.6 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-76bc0cbd.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240322_181232.log) |
22
+ | [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py) 🔥 | SGD, 1e-3, 40e | ✖️ | ✖️ | 45.1 | 52.8 | 69.5 | 57.8 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco_ep80-e1288152.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetuning_coco_20240327_014902.log) |
23
+
24
+
25
+ ### Reparameterized Training
26
+
27
+ | model | Schedule | `mask-refine` | efficient neck | AP<sup>ZS</sup>| AP | AP<sub>50</sub> | AP<sub>75</sub> | weights | log |
28
+ | :---- | :-------: | :----------: |:-------------: | :------------: | :-: | :--------------:| :-------------: |:------: | :-: |
29
+ | [YOLO-World-v2-S](./yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 37.5 | 46.3 | 62.8 | 50.4 | [HF Checkpoints]() | [log]() |
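+
+ ### Launching a run
+
+ The launch command follows the main README; a minimal sketch for the reparameterized S config above (the `work_dirs` output directory and `epoch_80.pth` filename follow the default mmengine conventions and are assumptions):
+
+ ```bash
+ # fine-tune on COCO with 8 GPUs
+ ./tools/dist_train.sh configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py 8 --amp
+ # evaluate the resulting checkpoint
+ ./tools/dist_test.sh configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py \
+   work_dirs/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco/epoch_80.pth 8
+ ```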
YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py ADDED
@@ -0,0 +1,179 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from='pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
21
+ persistent_workers = False
22
+
23
+ # model settings
24
+ model = dict(
25
+ type='YOLOWorldDetector',
26
+ mm_neck=True,
27
+ num_train_classes=num_training_classes,
28
+ num_test_classes=num_classes,
29
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
30
+ backbone=dict(
31
+ _delete_=True,
32
+ type='MultiModalYOLOBackbone',
33
+ image_model={{_base_.model.backbone}},
34
+ text_model=dict(
35
+ type='HuggingCLIPLanguageBackbone',
36
+ model_name='openai/clip-vit-base-patch32',
37
+ frozen_modules=['all'])),
38
+ neck=dict(type='YOLOWorldDualPAFPN',
39
+ guide_channels=text_channels,
40
+ embed_channels=neck_embed_channels,
41
+ num_heads=neck_num_heads,
42
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
43
+ text_enhancder=dict(type='ImagePoolingAttentionModule',
44
+ embed_channels=256,
45
+ num_heads=8)),
46
+ bbox_head=dict(type='YOLOWorldHead',
47
+ head_module=dict(type='YOLOWorldHeadModule',
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ text_transform = [
54
+ dict(type='RandomLoadText',
55
+ num_neg_samples=(num_classes, num_classes),
56
+ max_num_samples=num_training_classes,
57
+ padding_to_max=True,
58
+ padding_value=''),
59
+ dict(type='mmdet.PackDetInputs',
60
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
61
+ 'flip_direction', 'texts'))
62
+ ]
63
+ mosaic_affine_transform = [
64
+ dict(
65
+ type='MultiModalMosaic',
66
+ img_scale=_base_.img_scale,
67
+ pad_val=114.0,
68
+ pre_transform=_base_.pre_transform),
69
+ dict(
70
+ type='YOLOv5RandomAffine',
71
+ max_rotate_degree=0.0,
72
+ max_shear_degree=0.0,
73
+ max_aspect_ratio=100.,
74
+ scaling_ratio_range=(1 - _base_.affine_scale,
75
+ 1 + _base_.affine_scale),
76
+ # img_scale is (width, height)
77
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
78
+ border_val=(114, 114, 114))
79
+ ]
80
+ train_pipeline = [
81
+ *_base_.pre_transform,
82
+ *mosaic_affine_transform,
83
+ dict(
84
+ type='YOLOv5MultiModalMixUp',
85
+ prob=_base_.mixup_prob,
86
+ pre_transform=[*_base_.pre_transform,
87
+ *mosaic_affine_transform]),
88
+ *_base_.last_transform[:-1],
89
+ *text_transform
90
+ ]
91
+ train_pipeline_stage2 = [
92
+ *_base_.train_pipeline_stage2[:-1],
93
+ *text_transform
94
+ ]
95
+ coco_train_dataset = dict(
96
+ _delete_=True,
97
+ type='MultiModalDataset',
98
+ dataset=dict(
99
+ type='YOLOv5CocoDataset',
100
+ data_root='data/coco',
101
+ ann_file='annotations/instances_train2017.json',
102
+ data_prefix=dict(img='train2017/'),
103
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
104
+ class_text_path='data/texts/coco_class_texts.json',
105
+ pipeline=train_pipeline)
106
+
107
+ train_dataloader = dict(
108
+ persistent_workers=persistent_workers,
109
+ batch_size=train_batch_size_per_gpu,
110
+ collate_fn=dict(type='yolow_collate'),
111
+ dataset=coco_train_dataset)
112
+ test_pipeline = [
113
+ *_base_.test_pipeline[:-1],
114
+ dict(type='LoadText'),
115
+ dict(
116
+ type='mmdet.PackDetInputs',
117
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
118
+ 'scale_factor', 'pad_param', 'texts'))
119
+ ]
120
+ coco_val_dataset = dict(
121
+ _delete_=True,
122
+ type='MultiModalDataset',
123
+ dataset=dict(
124
+ type='YOLOv5CocoDataset',
125
+ data_root='data/coco',
126
+ ann_file='annotations/instances_val2017.json',
127
+ data_prefix=dict(img='val2017/'),
128
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
129
+ class_text_path='data/texts/coco_class_texts.json',
130
+ pipeline=test_pipeline)
131
+ val_dataloader = dict(dataset=coco_val_dataset)
132
+ test_dataloader = val_dataloader
133
+ # training settings
134
+ default_hooks = dict(
135
+ param_scheduler=dict(
136
+ scheduler_type='linear',
137
+ lr_factor=0.01,
138
+ max_epochs=max_epochs),
139
+ checkpoint=dict(
140
+ max_keep_ckpts=-1,
141
+ save_best=None,
142
+ interval=save_epoch_intervals))
143
+ custom_hooks = [
144
+ dict(
145
+ type='EMAHook',
146
+ ema_type='ExpMomentumEMA',
147
+ momentum=0.0001,
148
+ update_buffers=True,
149
+ strict_load=False,
150
+ priority=49),
151
+ dict(
152
+ type='mmdet.PipelineSwitchHook',
153
+ switch_epoch=max_epochs - close_mosaic_epochs,
154
+ switch_pipeline=train_pipeline_stage2)
155
+ ]
156
+ train_cfg = dict(
157
+ max_epochs=max_epochs,
158
+ val_interval=5,
159
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
160
+ _base_.val_interval_stage2)])
161
+ optim_wrapper = dict(
162
+ optimizer=dict(
163
+ _delete_=True,
164
+ type='AdamW',
165
+ lr=base_lr,
166
+ weight_decay=weight_decay,
167
+ batch_size_per_gpu=train_batch_size_per_gpu),
168
+ paramwise_cfg=dict(
169
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
170
+ 'logit_scale': dict(weight_decay=0.0)}),
171
+ constructor='YOLOWv5OptimizerConstructor')
172
+
173
+ # evaluation settings
174
+ val_evaluator = dict(
175
+ _delete_=True,
176
+ type='mmdet.CocoMetric',
177
+ proposal_nums=(100, 1, 10),
178
+ ann_file='data/coco/annotations/instances_val2017.json',
179
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,181 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from='pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
21
+ persistent_workers = False
22
+
23
+ # model settings
24
+ model = dict(
25
+ type='YOLOWorldDetector',
26
+ mm_neck=True,
27
+ num_train_classes=num_training_classes,
28
+ num_test_classes=num_classes,
29
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
30
+ backbone=dict(
31
+ _delete_=True,
32
+ type='MultiModalYOLOBackbone',
33
+ image_model={{_base_.model.backbone}},
34
+ text_model=dict(
35
+ type='HuggingCLIPLanguageBackbone',
36
+ model_name='openai/clip-vit-base-patch32',
37
+ frozen_modules=['all'])),
38
+ neck=dict(type='YOLOWorldDualPAFPN',
39
+ guide_channels=text_channels,
40
+ embed_channels=neck_embed_channels,
41
+ num_heads=neck_num_heads,
42
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
43
+ text_enhancder=dict(type='ImagePoolingAttentionModule',
44
+ embed_channels=256,
45
+ num_heads=8)),
46
+ bbox_head=dict(type='YOLOWorldHead',
47
+ head_module=dict(type='YOLOWorldHeadModule',
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ text_transform = [
54
+ dict(type='RandomLoadText',
55
+ num_neg_samples=(num_classes, num_classes),
56
+ max_num_samples=num_training_classes,
57
+ padding_to_max=True,
58
+ padding_value=''),
59
+ dict(type='mmdet.PackDetInputs',
60
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
61
+ 'flip_direction', 'texts'))
62
+ ]
63
+ mosaic_affine_transform = [
64
+ dict(
65
+ type='MultiModalMosaic',
66
+ img_scale=_base_.img_scale,
67
+ pad_val=114.0,
68
+ pre_transform=_base_.pre_transform),
69
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
70
+ dict(
71
+ type='YOLOv5RandomAffine',
72
+ max_rotate_degree=0.0,
73
+ max_shear_degree=0.0,
74
+ max_aspect_ratio=100.,
75
+ scaling_ratio_range=(1 - _base_.affine_scale,
76
+ 1 + _base_.affine_scale),
77
+ # img_scale is (width, height)
78
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
79
+ border_val=(114, 114, 114),
80
+ min_area_ratio=_base_.min_area_ratio,
81
+ use_mask_refine=_base_.use_mask2refine)
82
+ ]
83
+ train_pipeline = [
84
+ *_base_.pre_transform,
85
+ *mosaic_affine_transform,
86
+ dict(
87
+ type='YOLOv5MultiModalMixUp',
88
+ prob=_base_.mixup_prob,
89
+ pre_transform=[*_base_.pre_transform,
90
+ *mosaic_affine_transform]),
91
+ *_base_.last_transform[:-1],
92
+ *text_transform
93
+ ]
94
+ train_pipeline_stage2 = [
95
+ *_base_.train_pipeline_stage2[:-1],
96
+ *text_transform
97
+ ]
98
+ coco_train_dataset = dict(
99
+ _delete_=True,
100
+ type='MultiModalDataset',
101
+ dataset=dict(
102
+ type='YOLOv5CocoDataset',
103
+ data_root='data/coco',
104
+ ann_file='annotations/instances_train2017.json',
105
+ data_prefix=dict(img='train2017/'),
106
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
107
+ class_text_path='data/texts/coco_class_texts.json',
108
+ pipeline=train_pipeline)
109
+
110
+ train_dataloader = dict(
111
+ persistent_workers=persistent_workers,
112
+ batch_size=train_batch_size_per_gpu,
113
+ collate_fn=dict(type='yolow_collate'),
114
+ dataset=coco_train_dataset)
115
+ test_pipeline = [
116
+ *_base_.test_pipeline[:-1],
117
+ dict(type='LoadText'),
118
+ dict(
119
+ type='mmdet.PackDetInputs',
120
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
121
+ 'scale_factor', 'pad_param', 'texts'))
122
+ ]
123
+ coco_val_dataset = dict(
124
+ _delete_=True,
125
+ type='MultiModalDataset',
126
+ dataset=dict(
127
+ type='YOLOv5CocoDataset',
128
+ data_root='data/coco',
129
+ ann_file='annotations/instances_val2017.json',
130
+ data_prefix=dict(img='val2017/'),
131
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
132
+ class_text_path='data/texts/coco_class_texts.json',
133
+ pipeline=test_pipeline)
134
+ val_dataloader = dict(dataset=coco_val_dataset)
135
+ test_dataloader = val_dataloader
136
+ # training settings
137
+ default_hooks = dict(
138
+ param_scheduler=dict(
139
+ scheduler_type='linear',
140
+ lr_factor=0.01,
141
+ max_epochs=max_epochs),
142
+ checkpoint=dict(
143
+ max_keep_ckpts=-1,
144
+ save_best=None,
145
+ interval=save_epoch_intervals))
146
+ custom_hooks = [
147
+ dict(
148
+ type='EMAHook',
149
+ ema_type='ExpMomentumEMA',
150
+ momentum=0.0001,
151
+ update_buffers=True,
152
+ strict_load=False,
153
+ priority=49),
154
+ dict(
155
+ type='mmdet.PipelineSwitchHook',
156
+ switch_epoch=max_epochs - close_mosaic_epochs,
157
+ switch_pipeline=train_pipeline_stage2)
158
+ ]
159
+ train_cfg = dict(
160
+ max_epochs=max_epochs,
161
+ val_interval=5,
162
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
163
+ _base_.val_interval_stage2)])
164
+ optim_wrapper = dict(
165
+ optimizer=dict(
166
+ _delete_=True,
167
+ type='AdamW',
168
+ lr=base_lr,
169
+ weight_decay=weight_decay,
170
+ batch_size_per_gpu=train_batch_size_per_gpu),
171
+ paramwise_cfg=dict(
172
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
173
+ 'logit_scale': dict(weight_decay=0.0)}),
174
+ constructor='YOLOWv5OptimizerConstructor')
175
+ # evaluation settings
176
+ val_evaluator = dict(
177
+ _delete_=True,
178
+ type='mmdet.CocoMetric',
179
+ proposal_nums=(100, 1, 10),
180
+ ann_file='data/coco/annotations/instances_val2017.json',
181
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,159 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 80 # Maximum training epochs
9
+ close_mosaic_epochs = 10
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 2e-4
15
+ weight_decay = 0.05
16
+ train_batch_size_per_gpu = 16
17
+ load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
18
+ # huggingface text model
19
+ text_model_name = 'openai/clip-vit-base-patch32'
20
+ persistent_workers = False
21
+
22
+ # model settings
23
+ model = dict(
24
+ type='YOLOWorldDetector',
25
+ mm_neck=True,
26
+ num_train_classes=num_training_classes,
27
+ num_test_classes=num_classes,
28
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
29
+ backbone=dict(
30
+ _delete_=True,
31
+ type='MultiModalYOLOBackbone',
32
+ image_model={{_base_.model.backbone}},
33
+ text_model=dict(
34
+ type='HuggingCLIPLanguageBackbone',
35
+ model_name=text_model_name,
36
+ frozen_modules=['all'])),
37
+ neck=dict(type='YOLOWorldPAFPN',
38
+ guide_channels=text_channels,
39
+ embed_channels=neck_embed_channels,
40
+ num_heads=neck_num_heads,
41
+ block_cfg=dict(type='EfficientCSPLayerWithTwoConv')),
42
+ bbox_head=dict(type='YOLOWorldHead',
43
+ head_module=dict(type='YOLOWorldHeadModule',
44
+ embed_dims=text_channels,
45
+ num_classes=num_training_classes)),
46
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
47
+
48
+ # dataset settings
49
+ text_transform = [
50
+ dict(type='RandomLoadText',
51
+ num_neg_samples=(num_classes, num_classes),
52
+ max_num_samples=num_training_classes,
53
+ padding_to_max=True,
54
+ padding_value=''),
55
+ dict(type='mmdet.PackDetInputs',
56
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
57
+ 'flip_direction', 'texts'))
58
+ ]
59
+ mosaic_affine_transform = [
60
+ dict(type='MultiModalMosaic',
61
+ img_scale=_base_.img_scale,
62
+ pad_val=114.0,
63
+ pre_transform=_base_.pre_transform),
64
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
65
+ dict(
66
+ type='YOLOv5RandomAffine',
67
+ max_rotate_degree=0.0,
68
+ max_shear_degree=0.0,
69
+ max_aspect_ratio=100.,
70
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
71
+ # img_scale is (width, height)
72
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
73
+ border_val=(114, 114, 114),
74
+ min_area_ratio=_base_.min_area_ratio,
75
+ use_mask_refine=_base_.use_mask2refine)
76
+ ]
77
+ train_pipeline = [
78
+ *_base_.pre_transform, *mosaic_affine_transform,
79
+ dict(type='YOLOv5MultiModalMixUp',
80
+ prob=_base_.mixup_prob,
81
+ pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
82
+ *_base_.last_transform[:-1], *text_transform
83
+ ]
84
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
85
+ coco_train_dataset = dict(_delete_=True,
86
+ type='MultiModalDataset',
87
+ dataset=dict(
88
+ type='YOLOv5CocoDataset',
89
+ data_root='data/coco',
90
+ ann_file='annotations/instances_train2017.json',
91
+ data_prefix=dict(img='train2017/'),
92
+ filter_cfg=dict(filter_empty_gt=False,
93
+ min_size=32)),
94
+ class_text_path='data/texts/coco_class_texts.json',
95
+ pipeline=train_pipeline)
96
+
97
+ train_dataloader = dict(persistent_workers=persistent_workers,
98
+ batch_size=train_batch_size_per_gpu,
99
+ collate_fn=dict(type='yolow_collate'),
100
+ dataset=coco_train_dataset)
101
+ test_pipeline = [
102
+ *_base_.test_pipeline[:-1],
103
+ dict(type='LoadText'),
104
+ dict(type='mmdet.PackDetInputs',
105
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
106
+ 'scale_factor', 'pad_param', 'texts'))
107
+ ]
108
+ coco_val_dataset = dict(
109
+ _delete_=True,
110
+ type='MultiModalDataset',
111
+ dataset=dict(type='YOLOv5CocoDataset',
112
+ data_root='data/coco',
113
+ ann_file='annotations/instances_val2017.json',
114
+ data_prefix=dict(img='val2017/'),
115
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
116
+ class_text_path='data/texts/coco_class_texts.json',
117
+ pipeline=test_pipeline)
118
+ val_dataloader = dict(dataset=coco_val_dataset)
119
+ test_dataloader = val_dataloader
120
+ # training settings
121
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
122
+ lr_factor=0.01,
123
+ max_epochs=max_epochs),
124
+ checkpoint=dict(max_keep_ckpts=-1,
125
+ save_best=None,
126
+ interval=save_epoch_intervals))
127
+ custom_hooks = [
128
+ dict(type='EMAHook',
129
+ ema_type='ExpMomentumEMA',
130
+ momentum=0.0001,
131
+ update_buffers=True,
132
+ strict_load=False,
133
+ priority=49),
134
+ dict(type='mmdet.PipelineSwitchHook',
135
+ switch_epoch=max_epochs - close_mosaic_epochs,
136
+ switch_pipeline=train_pipeline_stage2)
137
+ ]
138
+ train_cfg = dict(max_epochs=max_epochs,
139
+ val_interval=5,
140
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
141
+ _base_.val_interval_stage2)])
142
+ optim_wrapper = dict(
143
+ optimizer=dict(
144
+ _delete_=True,
145
+ type='AdamW',
146
+ lr=base_lr,
147
+ weight_decay=weight_decay,
148
+ batch_size_per_gpu=train_batch_size_per_gpu),
149
+ paramwise_cfg=dict(
150
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
151
+ 'logit_scale': dict(weight_decay=0.0)}),
152
+ constructor='YOLOWv5OptimizerConstructor')
153
+
154
+ # evaluation settings
155
+ val_evaluator = dict(_delete_=True,
156
+ type='mmdet.CocoMetric',
157
+ proposal_nums=(100, 1, 10),
158
+ ann_file='data/coco/annotations/instances_val2017.json',
159
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,182 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
21
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'  # local copy; overridden by the HuggingFace model name below
22
+ text_model_name = 'openai/clip-vit-base-patch32'
23
+ persistent_workers = False
24
+
25
+ # model settings
26
+ model = dict(
27
+ type='YOLOWorldDetector',
28
+ mm_neck=True,
29
+ num_train_classes=num_training_classes,
30
+ num_test_classes=num_classes,
31
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
32
+ backbone=dict(
33
+ _delete_=True,
34
+ type='MultiModalYOLOBackbone',
35
+ image_model={{_base_.model.backbone}},
36
+ text_model=dict(
37
+ type='HuggingCLIPLanguageBackbone',
38
+ model_name=text_model_name,
39
+ frozen_modules=['all'])),
40
+ neck=dict(type='YOLOWorldPAFPN',
41
+ guide_channels=text_channels,
42
+ embed_channels=neck_embed_channels,
43
+ num_heads=neck_num_heads,
44
+ block_cfg=dict(type='EfficientCSPLayerWithTwoConv')),
45
+ bbox_head=dict(type='YOLOWorldHead',
46
+ head_module=dict(type='YOLOWorldHeadModule',
47
+ use_bn_head=True,
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ text_transform = [
54
+ dict(type='RandomLoadText',
55
+ num_neg_samples=(num_classes, num_classes),
56
+ max_num_samples=num_training_classes,
57
+ padding_to_max=True,
58
+ padding_value=''),
59
+ dict(type='mmdet.PackDetInputs',
60
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
61
+ 'flip_direction', 'texts'))
62
+ ]
63
+ mosaic_affine_transform = [
64
+ dict(
65
+ type='MultiModalMosaic',
66
+ img_scale=_base_.img_scale,
67
+ pad_val=114.0,
68
+ pre_transform=_base_.pre_transform),
69
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
70
+ dict(
71
+ type='YOLOv5RandomAffine',
72
+ max_rotate_degree=0.0,
73
+ max_shear_degree=0.0,
74
+ max_aspect_ratio=100.,
75
+ scaling_ratio_range=(1 - _base_.affine_scale,
76
+ 1 + _base_.affine_scale),
77
+ # img_scale is (width, height)
78
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
79
+ border_val=(114, 114, 114),
80
+ min_area_ratio=_base_.min_area_ratio,
81
+ use_mask_refine=_base_.use_mask2refine)
82
+ ]
83
+ train_pipeline = [
84
+ *_base_.pre_transform,
85
+ *mosaic_affine_transform,
86
+ dict(
87
+ type='YOLOv5MultiModalMixUp',
88
+ prob=_base_.mixup_prob,
89
+ pre_transform=[*_base_.pre_transform,
90
+ *mosaic_affine_transform]),
91
+ *_base_.last_transform[:-1],
92
+ *text_transform
93
+ ]
94
+ train_pipeline_stage2 = [
95
+ *_base_.train_pipeline_stage2[:-1],
96
+ *text_transform
97
+ ]
98
+ coco_train_dataset = dict(
99
+ _delete_=True,
100
+ type='MultiModalDataset',
101
+ dataset=dict(
102
+ type='YOLOv5CocoDataset',
103
+ data_root='data/coco',
104
+ ann_file='annotations/instances_train2017.json',
105
+ data_prefix=dict(img='train2017/'),
106
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
107
+ class_text_path='data/texts/coco_class_texts.json',
108
+ pipeline=train_pipeline)
109
+
110
+ train_dataloader = dict(
111
+ persistent_workers=persistent_workers,
112
+ batch_size=train_batch_size_per_gpu,
113
+ collate_fn=dict(type='yolow_collate'),
114
+ dataset=coco_train_dataset)
115
+ test_pipeline = [
116
+ *_base_.test_pipeline[:-1],
117
+ dict(type='LoadText'),
118
+ dict(
119
+ type='mmdet.PackDetInputs',
120
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
121
+ 'scale_factor', 'pad_param', 'texts'))
122
+ ]
123
+ coco_val_dataset = dict(
124
+ _delete_=True,
125
+ type='MultiModalDataset',
126
+ dataset=dict(
127
+ type='YOLOv5CocoDataset',
128
+ data_root='data/coco',
129
+ ann_file='annotations/instances_val2017.json',
130
+ data_prefix=dict(img='val2017/'),
131
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
132
+ class_text_path='data/texts/coco_class_texts.json',
133
+ pipeline=test_pipeline)
134
+ val_dataloader = dict(dataset=coco_val_dataset)
135
+ test_dataloader = val_dataloader
136
+ # training settings
137
+ default_hooks = dict(
138
+ param_scheduler=dict(
139
+ scheduler_type='linear',
140
+ lr_factor=0.01,
141
+ max_epochs=max_epochs),
142
+ checkpoint=dict(
143
+ max_keep_ckpts=-1,
144
+ save_best=None,
145
+ interval=save_epoch_intervals))
146
+ custom_hooks = [
147
+ dict(
148
+ type='EMAHook',
149
+ ema_type='ExpMomentumEMA',
150
+ momentum=0.0001,
151
+ update_buffers=True,
152
+ strict_load=False,
153
+ priority=49),
154
+ dict(
155
+ type='mmdet.PipelineSwitchHook',
156
+ switch_epoch=max_epochs - close_mosaic_epochs,
157
+ switch_pipeline=train_pipeline_stage2)
158
+ ]
159
+ train_cfg = dict(
160
+ max_epochs=max_epochs,
161
+ val_interval=5,
162
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
163
+ _base_.val_interval_stage2)])
164
+ optim_wrapper = dict(
165
+ optimizer=dict(
166
+ _delete_=True,
167
+ type='AdamW',
168
+ lr=base_lr,
169
+ weight_decay=weight_decay,
170
+ batch_size_per_gpu=train_batch_size_per_gpu),
171
+ paramwise_cfg=dict(
172
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
173
+ 'logit_scale': dict(weight_decay=0.0)}),
174
+ constructor='YOLOWv5OptimizerConstructor')
175
+
176
+ # evaluation settings
177
+ val_evaluator = dict(
178
+ _delete_=True,
179
+ type='mmdet.CocoMetric',
180
+ proposal_nums=(100, 1, 10),
181
+ ann_file='data/coco/annotations/instances_val2017.json',
182
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,181 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
21
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'  # local copy; overridden by the HuggingFace model name below
22
+ text_model_name = 'openai/clip-vit-base-patch32'
23
+ persistent_workers = False
24
+
25
+ # model settings
26
+ model = dict(
27
+ type='YOLOWorldDetector',
28
+ mm_neck=True,
29
+ num_train_classes=num_training_classes,
30
+ num_test_classes=num_classes,
31
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
32
+ backbone=dict(
33
+ _delete_=True,
34
+ type='MultiModalYOLOBackbone',
35
+ image_model={{_base_.model.backbone}},
36
+ text_model=dict(
37
+ type='HuggingCLIPLanguageBackbone',
38
+ model_name=text_model_name,
39
+ frozen_modules=['all'])),
40
+ neck=dict(type='YOLOWorldPAFPN',
41
+ guide_channels=text_channels,
42
+ embed_channels=neck_embed_channels,
43
+ num_heads=neck_num_heads,
44
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
45
+ bbox_head=dict(type='YOLOWorldHead',
46
+ head_module=dict(type='YOLOWorldHeadModule',
47
+ use_bn_head=True,
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ text_transform = [
54
+ dict(type='RandomLoadText',
55
+ num_neg_samples=(num_classes, num_classes),
56
+ max_num_samples=num_training_classes,
57
+ padding_to_max=True,
58
+ padding_value=''),
59
+ dict(type='mmdet.PackDetInputs',
60
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
61
+ 'flip_direction', 'texts'))
62
+ ]
63
+ mosaic_affine_transform = [
64
+ dict(
65
+ type='MultiModalMosaic',
66
+ img_scale=_base_.img_scale,
67
+ pad_val=114.0,
68
+ pre_transform=_base_.pre_transform),
69
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
70
+ dict(
71
+ type='YOLOv5RandomAffine',
72
+ max_rotate_degree=0.0,
73
+ max_shear_degree=0.0,
74
+ max_aspect_ratio=100.,
75
+ scaling_ratio_range=(1 - _base_.affine_scale,
76
+ 1 + _base_.affine_scale),
77
+ # img_scale is (width, height)
78
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
79
+ border_val=(114, 114, 114),
80
+ min_area_ratio=_base_.min_area_ratio,
81
+ use_mask_refine=_base_.use_mask2refine)
82
+ ]
83
+ train_pipeline = [
84
+ *_base_.pre_transform,
85
+ *mosaic_affine_transform,
86
+ dict(
87
+ type='YOLOv5MultiModalMixUp',
88
+ prob=_base_.mixup_prob,
89
+ pre_transform=[*_base_.pre_transform,
90
+ *mosaic_affine_transform]),
91
+ *_base_.last_transform[:-1],
92
+ *text_transform
93
+ ]
94
+ train_pipeline_stage2 = [
95
+ *_base_.train_pipeline_stage2[:-1],
96
+ *text_transform
97
+ ]
98
+ coco_train_dataset = dict(
99
+ _delete_=True,
100
+ type='MultiModalDataset',
101
+ dataset=dict(
102
+ type='YOLOv5CocoDataset',
103
+ data_root='data/coco',
104
+ ann_file='annotations/instances_train2017.json',
105
+ data_prefix=dict(img='train2017/'),
106
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
107
+ class_text_path='data/texts/coco_class_texts.json',
108
+ pipeline=train_pipeline)
109
+
110
+ train_dataloader = dict(
111
+ persistent_workers=persistent_workers,
112
+ batch_size=train_batch_size_per_gpu,
113
+ collate_fn=dict(type='yolow_collate'),
114
+ dataset=coco_train_dataset)
115
+ test_pipeline = [
116
+ *_base_.test_pipeline[:-1],
117
+ dict(type='LoadText'),
118
+ dict(
119
+ type='mmdet.PackDetInputs',
120
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
121
+ 'scale_factor', 'pad_param', 'texts'))
122
+ ]
123
+ coco_val_dataset = dict(
124
+ _delete_=True,
125
+ type='MultiModalDataset',
126
+ dataset=dict(
127
+ type='YOLOv5CocoDataset',
128
+ data_root='data/coco',
129
+ ann_file='annotations/instances_val2017.json',
130
+ data_prefix=dict(img='val2017/'),
131
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
132
+ class_text_path='data/texts/coco_class_texts.json',
133
+ pipeline=test_pipeline)
134
+ val_dataloader = dict(dataset=coco_val_dataset)
135
+ test_dataloader = val_dataloader
136
+ # training settings
137
+ default_hooks = dict(
138
+ param_scheduler=dict(
139
+ scheduler_type='linear',
140
+ lr_factor=0.01,
141
+ max_epochs=max_epochs),
142
+ checkpoint=dict(
143
+ max_keep_ckpts=-1,
144
+ save_best=None,
145
+ interval=save_epoch_intervals))
146
+ custom_hooks = [
147
+ dict(
148
+ type='EMAHook',
149
+ ema_type='ExpMomentumEMA',
150
+ momentum=0.0001,
151
+ update_buffers=True,
152
+ strict_load=False,
153
+ priority=49),
154
+ dict(
155
+ type='mmdet.PipelineSwitchHook',
156
+ switch_epoch=max_epochs - close_mosaic_epochs,
157
+ switch_pipeline=train_pipeline_stage2)
158
+ ]
159
+ train_cfg = dict(
160
+ max_epochs=max_epochs,
161
+ val_interval=5,
162
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
163
+ _base_.val_interval_stage2)])
164
+ optim_wrapper = dict(
165
+ optimizer=dict(
166
+ _delete_=True,
167
+ type='AdamW',
168
+ lr=base_lr,
169
+ weight_decay=weight_decay,
170
+ batch_size_per_gpu=train_batch_size_per_gpu),
171
+ paramwise_cfg=dict(
172
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
173
+ 'logit_scale': dict(weight_decay=0.0)}),
174
+ constructor='YOLOWv5OptimizerConstructor')
175
+ # evaluation settings
176
+ val_evaluator = dict(
177
+ _delete_=True,
178
+ type='mmdet.CocoMetric',
179
+ proposal_nums=(100, 1, 10),
180
+ ann_file='data/coco/annotations/instances_val2017.json',
181
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py ADDED
@@ -0,0 +1,160 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 40 # Maximum training epochs
9
+ close_mosaic_epochs = 30
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 1e-3
15
+ weight_decay = 0.0005
16
+ train_batch_size_per_gpu = 16
17
+ load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
18
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
19
+ text_model_name = 'openai/clip-vit-base-patch32'
20
+ persistent_workers = False
21
+
22
+ # model settings
23
+ model = dict(type='YOLOWorldDetector',
24
+ mm_neck=True,
25
+ num_train_classes=num_training_classes,
26
+ num_test_classes=num_classes,
27
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
28
+ backbone=dict(_delete_=True,
29
+ type='MultiModalYOLOBackbone',
30
+ image_model={{_base_.model.backbone}},
31
+ text_model=dict(type='HuggingCLIPLanguageBackbone',
32
+ model_name=text_model_name,
33
+ frozen_modules=['all'])),
34
+ neck=dict(type='YOLOWorldPAFPN',
35
+ guide_channels=text_channels,
36
+ embed_channels=neck_embed_channels,
37
+ num_heads=neck_num_heads,
38
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
39
+ bbox_head=dict(type='YOLOWorldHead',
40
+ head_module=dict(
41
+ type='YOLOWorldHeadModule',
42
+ use_bn_head=True,
43
+ embed_dims=text_channels,
44
+ num_classes=num_training_classes)),
45
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
46
+
47
+ # dataset settings
48
+ text_transform = [
49
+ dict(type='RandomLoadText',
50
+ num_neg_samples=(num_classes, num_classes),
51
+ max_num_samples=num_training_classes,
52
+ padding_to_max=True,
53
+ padding_value=''),
54
+ dict(type='mmdet.PackDetInputs',
55
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
56
+ 'flip_direction', 'texts'))
57
+ ]
58
+ mosaic_affine_transform = [
59
+ dict(type='MultiModalMosaic',
60
+ img_scale=_base_.img_scale,
61
+ pad_val=114.0,
62
+ pre_transform=_base_.pre_transform),
63
+ dict(
64
+ type='YOLOv5RandomAffine',
65
+ max_rotate_degree=0.0,
66
+ max_shear_degree=0.0,
67
+ max_aspect_ratio=100.,
68
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
69
+ # img_scale is (width, height)
70
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
71
+ border_val=(114, 114, 114))
72
+ ]
73
+
74
+ train_pipeline = [
75
+ *_base_.pre_transform, *mosaic_affine_transform,
76
+ dict(type='YOLOv5MultiModalMixUp',
77
+ prob=_base_.mixup_prob,
78
+ pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
79
+ *_base_.last_transform[:-1], *text_transform
80
+ ]
81
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
82
+
83
+ coco_train_dataset = dict(_delete_=True,
84
+ type='MultiModalDataset',
85
+ dataset=dict(
86
+ type='YOLOv5CocoDataset',
87
+ data_root='data/coco',
88
+ ann_file='annotations/instances_train2017.json',
89
+ data_prefix=dict(img='train2017/'),
90
+ filter_cfg=dict(filter_empty_gt=False,
91
+ min_size=32)),
92
+ class_text_path='data/texts/coco_class_texts.json',
93
+ pipeline=train_pipeline)
94
+
95
+ train_dataloader = dict(persistent_workers=persistent_workers,
96
+ batch_size=train_batch_size_per_gpu,
97
+ collate_fn=dict(type='yolow_collate'),
98
+ dataset=coco_train_dataset)
99
+ test_pipeline = [
100
+ *_base_.test_pipeline[:-1],
101
+ dict(type='LoadText'),
102
+ dict(type='mmdet.PackDetInputs',
103
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
104
+ 'scale_factor', 'pad_param', 'texts'))
105
+ ]
106
+ coco_val_dataset = dict(
107
+ _delete_=True,
108
+ type='MultiModalDataset',
109
+ dataset=dict(type='YOLOv5CocoDataset',
110
+ data_root='data/coco',
111
+ ann_file='annotations/instances_val2017.json',
112
+ data_prefix=dict(img='val2017/'),
113
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
114
+ class_text_path='data/texts/coco_class_texts.json',
115
+ pipeline=test_pipeline)
116
+ val_dataloader = dict(dataset=coco_val_dataset)
117
+ test_dataloader = val_dataloader
118
+ # training settings
119
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
120
+ lr_factor=0.01,
121
+ max_epochs=max_epochs),
122
+ checkpoint=dict(max_keep_ckpts=-1,
123
+ save_best=None,
124
+ interval=save_epoch_intervals))
125
+ custom_hooks = [
126
+ dict(type='EMAHook',
127
+ ema_type='ExpMomentumEMA',
128
+ momentum=0.0001,
129
+ update_buffers=True,
130
+ strict_load=False,
131
+ priority=49),
132
+ dict(type='mmdet.PipelineSwitchHook',
133
+ switch_epoch=max_epochs - close_mosaic_epochs,
134
+ switch_pipeline=train_pipeline_stage2)
135
+ ]
136
+ train_cfg = dict(max_epochs=max_epochs,
137
+ val_interval=5,
138
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
139
+ _base_.val_interval_stage2)])
140
+ optim_wrapper = dict(optimizer=dict(
141
+ _delete_=True,
142
+ type='SGD',
143
+ lr=base_lr,
144
+ momentum=0.937,
145
+ nesterov=True,
146
+ weight_decay=weight_decay,
147
+ batch_size_per_gpu=train_batch_size_per_gpu),
148
+ paramwise_cfg=dict(
149
+ custom_keys={
150
+ 'backbone.text_model': dict(lr_mult=0.01),
151
+ 'logit_scale': dict(weight_decay=0.0)
152
+ }),
153
+ constructor='YOLOWv5OptimizerConstructor')
154
+
155
+ # evaluation settings
156
+ val_evaluator = dict(_delete_=True,
157
+ type='mmdet.CocoMetric',
158
+ proposal_nums=(100, 1, 10),
159
+ ann_file='data/coco/annotations/instances_val2017.json',
160
+ metric='bbox')
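
The SGD schedule above disables mosaic/mixup for the final close_mosaic_epochs epochs, so with max_epochs = 40 and close_mosaic_epochs = 30 the PipelineSwitchHook fires at epoch 40 - 30 = 10, and the dynamic validation interval changes at the same point. Below is a minimal sketch of inspecting and overriding this config with MMEngine; the relative path is an assumption about the checked-out repository layout, and the third_party/mmyolo base configs must be present so that _base_ resolves.

# Hedged sketch: inspect and override the finetune config in memory with MMEngine.
# The relative path is an assumption about the checked-out YOLO-World repo layout.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/finetune_coco/'
    'yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py')

# Mosaic/mixup are disabled for the last `close_mosaic_epochs` epochs,
# i.e. from epoch max_epochs - close_mosaic_epochs = 40 - 30 = 10.
print(cfg.max_epochs, cfg.close_mosaic_epochs)   # 40 30
print(cfg.custom_hooks[1]['switch_epoch'])       # 10

# Override the per-GPU batch size without touching the file on disk.
cfg.merge_from_dict(dict(train_dataloader=dict(batch_size=8)))
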
YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,161 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 80 # Maximum training epochs
9
+ close_mosaic_epochs = 10
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 1e-3
15
+ weight_decay = 0.0005
16
+ train_batch_size_per_gpu = 16
17
+ load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
18
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
19
+ text_model_name = 'openai/clip-vit-base-patch32'
20
+ persistent_workers = False
21
+
22
+ # model settings
23
+ model = dict(type='YOLOWorldDetector',
24
+ mm_neck=True,
25
+ num_train_classes=num_training_classes,
26
+ num_test_classes=num_classes,
27
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
28
+ backbone=dict(_delete_=True,
29
+ type='MultiModalYOLOBackbone',
30
+ image_model={{_base_.model.backbone}},
31
+ text_model=dict(type='HuggingCLIPLanguageBackbone',
32
+ model_name=text_model_name,
33
+ frozen_modules=['all'])),
34
+ neck=dict(type='YOLOWorldPAFPN',
35
+ guide_channels=text_channels,
36
+ embed_channels=neck_embed_channels,
37
+ num_heads=neck_num_heads,
38
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
39
+ bbox_head=dict(type='YOLOWorldHead',
40
+ head_module=dict(
41
+ type='YOLOWorldHeadModule',
42
+ use_bn_head=True,
43
+ embed_dims=text_channels,
44
+ num_classes=num_training_classes)),
45
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
46
+
47
+ # dataset settings
48
+ text_transform = [
49
+ dict(type='RandomLoadText',
50
+ num_neg_samples=(num_classes, num_classes),
51
+ max_num_samples=num_training_classes,
52
+ padding_to_max=True,
53
+ padding_value=''),
54
+ dict(type='mmdet.PackDetInputs',
55
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
56
+ 'flip_direction', 'texts'))
57
+ ]
58
+ mosaic_affine_transform = [
59
+ dict(type='MultiModalMosaic',
60
+ img_scale=_base_.img_scale,
61
+ pad_val=114.0,
62
+ pre_transform=_base_.pre_transform),
63
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
64
+ dict(
65
+ type='YOLOv5RandomAffine',
66
+ max_rotate_degree=0.0,
67
+ max_shear_degree=0.0,
68
+ max_aspect_ratio=100.,
69
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
70
+ # img_scale is (width, height)
71
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
72
+ border_val=(114, 114, 114),
73
+ min_area_ratio=_base_.min_area_ratio,
74
+ use_mask_refine=_base_.use_mask2refine)
75
+ ]
76
+ train_pipeline = [
77
+ *_base_.pre_transform, *mosaic_affine_transform,
78
+ dict(type='YOLOv5MultiModalMixUp',
79
+ prob=_base_.mixup_prob,
80
+ pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
81
+ *_base_.last_transform[:-1], *text_transform
82
+ ]
83
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
84
+ coco_train_dataset = dict(_delete_=True,
85
+ type='MultiModalDataset',
86
+ dataset=dict(
87
+ type='YOLOv5CocoDataset',
88
+ data_root='data/coco',
89
+ ann_file='annotations/instances_train2017.json',
90
+ data_prefix=dict(img='train2017/'),
91
+ filter_cfg=dict(filter_empty_gt=False,
92
+ min_size=32)),
93
+ class_text_path='data/texts/coco_class_texts.json',
94
+ pipeline=train_pipeline)
95
+
96
+ train_dataloader = dict(persistent_workers=persistent_workers,
97
+ batch_size=train_batch_size_per_gpu,
98
+ collate_fn=dict(type='yolow_collate'),
99
+ dataset=coco_train_dataset)
100
+ test_pipeline = [
101
+ *_base_.test_pipeline[:-1],
102
+ dict(type='LoadText'),
103
+ dict(type='mmdet.PackDetInputs',
104
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
105
+ 'scale_factor', 'pad_param', 'texts'))
106
+ ]
107
+ coco_val_dataset = dict(
108
+ _delete_=True,
109
+ type='MultiModalDataset',
110
+ dataset=dict(type='YOLOv5CocoDataset',
111
+ data_root='data/coco',
112
+ ann_file='annotations/instances_val2017.json',
113
+ data_prefix=dict(img='val2017/'),
114
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
115
+ class_text_path='data/texts/coco_class_texts.json',
116
+ pipeline=test_pipeline)
117
+ val_dataloader = dict(dataset=coco_val_dataset)
118
+ test_dataloader = val_dataloader
119
+ # training settings
120
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
121
+ lr_factor=0.01,
122
+ max_epochs=max_epochs),
123
+ checkpoint=dict(max_keep_ckpts=-1,
124
+ save_best=None,
125
+ interval=save_epoch_intervals))
126
+ custom_hooks = [
127
+ dict(type='EMAHook',
128
+ ema_type='ExpMomentumEMA',
129
+ momentum=0.0001,
130
+ update_buffers=True,
131
+ strict_load=False,
132
+ priority=49),
133
+ dict(type='mmdet.PipelineSwitchHook',
134
+ switch_epoch=max_epochs - close_mosaic_epochs,
135
+ switch_pipeline=train_pipeline_stage2)
136
+ ]
137
+ train_cfg = dict(max_epochs=max_epochs,
138
+ val_interval=5,
139
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
140
+ _base_.val_interval_stage2)])
141
+ optim_wrapper = dict(optimizer=dict(
142
+ _delete_=True,
143
+ type='SGD',
144
+ lr=base_lr,
145
+ momentum=0.937,
146
+ nesterov=True,
147
+ weight_decay=weight_decay,
148
+ batch_size_per_gpu=train_batch_size_per_gpu),
149
+ paramwise_cfg=dict(
150
+ custom_keys={
151
+ 'backbone.text_model': dict(lr_mult=0.01),
152
+ 'logit_scale': dict(weight_decay=0.0)
153
+ }),
154
+ constructor='YOLOWv5OptimizerConstructor')
155
+
156
+ # evaluation settings
157
+ val_evaluator = dict(_delete_=True,
158
+ type='mmdet.CocoMetric',
159
+ proposal_nums=(100, 1, 10),
160
+ ann_file='data/coco/annotations/instances_val2017.json',
161
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,182 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth'
21
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
22
+ text_model_name = 'openai/clip-vit-base-patch32'
23
+ persistent_workers = False
24
+
25
+ # model settings
26
+ model = dict(
27
+ type='YOLOWorldDetector',
28
+ mm_neck=True,
29
+ num_train_classes=num_training_classes,
30
+ num_test_classes=num_classes,
31
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
32
+ backbone=dict(
33
+ _delete_=True,
34
+ type='MultiModalYOLOBackbone',
35
+ image_model={{_base_.model.backbone}},
36
+ text_model=dict(
37
+ type='HuggingCLIPLanguageBackbone',
38
+ model_name=text_model_name,
39
+ frozen_modules=['all'])),
40
+ neck=dict(type='YOLOWorldPAFPN',
41
+ guide_channels=text_channels,
42
+ embed_channels=neck_embed_channels,
43
+ num_heads=neck_num_heads,
44
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
45
+ bbox_head=dict(type='YOLOWorldHead',
46
+ head_module=dict(type='YOLOWorldHeadModule',
47
+ use_bn_head=True,
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ text_transform = [
54
+ dict(type='RandomLoadText',
55
+ num_neg_samples=(num_classes, num_classes),
56
+ max_num_samples=num_training_classes,
57
+ padding_to_max=True,
58
+ padding_value=''),
59
+ dict(type='mmdet.PackDetInputs',
60
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
61
+ 'flip_direction', 'texts'))
62
+ ]
63
+ mosaic_affine_transform = [
64
+ dict(
65
+ type='MultiModalMosaic',
66
+ img_scale=_base_.img_scale,
67
+ pad_val=114.0,
68
+ pre_transform=_base_.pre_transform),
69
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
70
+ dict(
71
+ type='YOLOv5RandomAffine',
72
+ max_rotate_degree=0.0,
73
+ max_shear_degree=0.0,
74
+ max_aspect_ratio=100.,
75
+ scaling_ratio_range=(1 - _base_.affine_scale,
76
+ 1 + _base_.affine_scale),
77
+ # img_scale is (width, height)
78
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
79
+ border_val=(114, 114, 114),
80
+ min_area_ratio=_base_.min_area_ratio,
81
+ use_mask_refine=_base_.use_mask2refine)
82
+ ]
83
+ train_pipeline = [
84
+ *_base_.pre_transform,
85
+ *mosaic_affine_transform,
86
+ dict(
87
+ type='YOLOv5MultiModalMixUp',
88
+ prob=_base_.mixup_prob,
89
+ pre_transform=[*_base_.pre_transform,
90
+ *mosaic_affine_transform]),
91
+ *_base_.last_transform[:-1],
92
+ *text_transform
93
+ ]
94
+ train_pipeline_stage2 = [
95
+ *_base_.train_pipeline_stage2[:-1],
96
+ *text_transform
97
+ ]
98
+ coco_train_dataset = dict(
99
+ _delete_=True,
100
+ type='MultiModalDataset',
101
+ dataset=dict(
102
+ type='YOLOv5CocoDataset',
103
+ data_root='data/coco',
104
+ ann_file='annotations/instances_train2017.json',
105
+ data_prefix=dict(img='train2017/'),
106
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
107
+ class_text_path='data/texts/coco_class_texts.json',
108
+ pipeline=train_pipeline)
109
+
110
+ train_dataloader = dict(
111
+ persistent_workers=persistent_workers,
112
+ batch_size=train_batch_size_per_gpu,
113
+ collate_fn=dict(type='yolow_collate'),
114
+ dataset=coco_train_dataset)
115
+ test_pipeline = [
116
+ *_base_.test_pipeline[:-1],
117
+ dict(type='LoadText'),
118
+ dict(
119
+ type='mmdet.PackDetInputs',
120
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
121
+ 'scale_factor', 'pad_param', 'texts'))
122
+ ]
123
+ coco_val_dataset = dict(
124
+ _delete_=True,
125
+ type='MultiModalDataset',
126
+ dataset=dict(
127
+ type='YOLOv5CocoDataset',
128
+ data_root='data/coco',
129
+ ann_file='annotations/instances_val2017.json',
130
+ data_prefix=dict(img='val2017/'),
131
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
132
+ class_text_path='data/texts/coco_class_texts.json',
133
+ pipeline=test_pipeline)
134
+ val_dataloader = dict(dataset=coco_val_dataset)
135
+ test_dataloader = val_dataloader
136
+ # training settings
137
+ default_hooks = dict(
138
+ param_scheduler=dict(
139
+ scheduler_type='linear',
140
+ lr_factor=0.01,
141
+ max_epochs=max_epochs),
142
+ checkpoint=dict(
143
+ max_keep_ckpts=-1,
144
+ save_best=None,
145
+ interval=save_epoch_intervals))
146
+ custom_hooks = [
147
+ dict(
148
+ type='EMAHook',
149
+ ema_type='ExpMomentumEMA',
150
+ momentum=0.0001,
151
+ update_buffers=True,
152
+ strict_load=False,
153
+ priority=49),
154
+ dict(
155
+ type='mmdet.PipelineSwitchHook',
156
+ switch_epoch=max_epochs - close_mosaic_epochs,
157
+ switch_pipeline=train_pipeline_stage2)
158
+ ]
159
+ train_cfg = dict(
160
+ max_epochs=max_epochs,
161
+ val_interval=5,
162
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
163
+ _base_.val_interval_stage2)])
164
+ optim_wrapper = dict(
165
+ optimizer=dict(
166
+ _delete_=True,
167
+ type='AdamW',
168
+ lr=base_lr,
169
+ weight_decay=weight_decay,
170
+ batch_size_per_gpu=train_batch_size_per_gpu),
171
+ paramwise_cfg=dict(
172
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
173
+ 'logit_scale': dict(weight_decay=0.0)}),
174
+ constructor='YOLOWv5OptimizerConstructor')
175
+
176
+ # evaluation settings
177
+ val_evaluator = dict(
178
+ _delete_=True,
179
+ type='mmdet.CocoMetric',
180
+ proposal_nums=(100, 1, 10),
181
+ ann_file='data/coco/annotations/instances_val2017.json',
182
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,145 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 80 # Maximum training epochs
9
+ close_mosaic_epochs = 10
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 2e-4
15
+ weight_decay = 0.05
16
+ train_batch_size_per_gpu = 16
17
+ load_from = '../FastDet/output_models/pretrain_yolow-v8_s_clipv2_frozen_te_noprompt_t2i_bn_2e-3adamw_scale_lr_wd_32xb16-100e_obj365v1_goldg_cc3mram250k_train_lviseval-e3592307_rep_conv.pth'
18
+ persistent_workers = False
19
+ mixup_prob = 0.15
20
+ copypaste_prob = 0.3
21
+
22
+ # model settings
23
+ model = dict(type='SimpleYOLOWorldDetector',
24
+ mm_neck=True,
25
+ num_train_classes=num_classes,
26
+ num_test_classes=num_classes,
27
+ reparameterized=True,
28
+ data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
29
+ backbone=dict(_delete_=True,
30
+ type='MultiModalYOLOBackbone',
31
+ text_model=None,
32
+ image_model={{_base_.model.backbone}},
33
+ with_text_model=False),
34
+ neck=dict(type='YOLOWorldPAFPN',
35
+ guide_channels=text_channels,
36
+ embed_channels=neck_embed_channels,
37
+ num_heads=neck_num_heads,
38
+ block_cfg=dict(type='EfficientCSPLayerWithTwoConv')),
39
+ bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule',
40
+ embed_dims=text_channels,
41
+ num_guide=num_classes,
42
+ num_classes=num_classes)),
43
+ train_cfg=dict(assigner=dict(num_classes=num_classes)))
44
+
45
+ # dataset settings
46
+ final_transform = [
47
+ dict(type='mmdet.PackDetInputs',
48
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
49
+ 'flip_direction'))
50
+ ]
51
+ mosaic_affine_transform = [
52
+ dict(type='Mosaic',
53
+ img_scale=_base_.img_scale,
54
+ pad_val=114.0,
55
+ pre_transform=_base_.pre_transform),
56
+ dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
57
+ dict(
58
+ type='YOLOv5RandomAffine',
59
+ max_rotate_degree=0.0,
60
+ max_shear_degree=0.0,
61
+ max_aspect_ratio=100.,
62
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
63
+ # img_scale is (width, height)
64
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
65
+ border_val=(114, 114, 114),
66
+ min_area_ratio=_base_.min_area_ratio,
67
+ use_mask_refine=_base_.use_mask2refine)
68
+ ]
69
+ train_pipeline = [
70
+ *_base_.pre_transform, *mosaic_affine_transform,
71
+ dict(type='YOLOv5MixUp',
72
+ prob=mixup_prob,
73
+ pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
74
+ *_base_.last_transform[:-1], *final_transform
75
+ ]
76
+
77
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]
78
+
79
+ coco_train_dataset = dict(type='YOLOv5CocoDataset',
80
+ data_root='data/coco',
81
+ ann_file='annotations/instances_train2017.json',
82
+ data_prefix=dict(img='train2017/'),
83
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
84
+ pipeline=train_pipeline)
85
+
86
+ train_dataloader = dict(persistent_workers=persistent_workers,
87
+ batch_size=train_batch_size_per_gpu,
88
+ collate_fn=dict(type='yolow_collate'),
89
+ dataset=coco_train_dataset)
90
+
91
+ train_dataloader = dict(persistent_workers=persistent_workers,
92
+ batch_size=train_batch_size_per_gpu,
93
+ collate_fn=dict(type='yolow_collate'),
94
+ dataset=coco_train_dataset)
95
+ test_pipeline = [
96
+ *_base_.test_pipeline[:-1],
97
+ dict(type='mmdet.PackDetInputs',
98
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
99
+ 'scale_factor', 'pad_param'))
100
+ ]
101
+ coco_val_dataset = dict(type='YOLOv5CocoDataset',
102
+ data_root='data/coco',
103
+ ann_file='annotations/instances_val2017.json',
104
+ data_prefix=dict(img='val2017/'),
105
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
106
+ pipeline=test_pipeline)
107
+
108
+ val_dataloader = dict(dataset=coco_val_dataset)
109
+ test_dataloader = val_dataloader
110
+ # training settings
111
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
112
+ lr_factor=0.01,
113
+ max_epochs=max_epochs),
114
+ checkpoint=dict(max_keep_ckpts=-1,
115
+ save_best=None,
116
+ interval=save_epoch_intervals))
117
+ custom_hooks = [
118
+ dict(type='EMAHook',
119
+ ema_type='ExpMomentumEMA',
120
+ momentum=0.0001,
121
+ update_buffers=True,
122
+ strict_load=False,
123
+ priority=49),
124
+ dict(type='mmdet.PipelineSwitchHook',
125
+ switch_epoch=max_epochs - close_mosaic_epochs,
126
+ switch_pipeline=train_pipeline_stage2)
127
+ ]
128
+ train_cfg = dict(max_epochs=max_epochs,
129
+ val_interval=5,
130
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
131
+ _base_.val_interval_stage2)])
132
+ optim_wrapper = dict(optimizer=dict(
133
+ _delete_=True,
134
+ type='AdamW',
135
+ lr=base_lr,
136
+ weight_decay=weight_decay,
137
+ batch_size_per_gpu=train_batch_size_per_gpu),
138
+ constructor='YOLOWv5OptimizerConstructor')
139
+
140
+ # evaluation settings
141
+ val_evaluator = dict(_delete_=True,
142
+ type='mmdet.CocoMetric',
143
+ proposal_nums=(100, 1, 10),
144
+ ann_file='data/coco/annotations/instances_val2017.json',
145
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,146 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 80 # Maximum training epochs
9
+ close_mosaic_epochs = 10
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 2e-4
15
+ weight_decay = 0.05
16
+ train_batch_size_per_gpu = 16
17
+ load_from = '../FastDet/output_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea_rep_conv.pth'
18
+ persistent_workers = False
19
+ mixup_prob = 0.15
20
+ copypaste_prob = 0.3
21
+
22
+ # model settings
23
+ model = dict(type='SimpleYOLOWorldDetector',
24
+ mm_neck=True,
25
+ num_train_classes=num_classes,
26
+ num_test_classes=num_classes,
27
+ reparameterized=True,
28
+ data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
29
+ backbone=dict(_delete_=True,
30
+ type='MultiModalYOLOBackbone',
31
+ text_model=None,
32
+ image_model={{_base_.model.backbone}},
33
+ with_text_model=False),
34
+ neck=dict(type='YOLOWorldPAFPN',
35
+ guide_channels=num_classes,
36
+ embed_channels=neck_embed_channels,
37
+ num_heads=neck_num_heads,
38
+ block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv',
39
+ guide_channels=num_classes)),
40
+ bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule',
41
+ embed_dims=text_channels,
42
+ num_guide=num_classes,
43
+ num_classes=num_classes)),
44
+ train_cfg=dict(assigner=dict(num_classes=num_classes)))
45
+
46
+ # dataset settings
47
+ final_transform = [
48
+ dict(type='mmdet.PackDetInputs',
49
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
50
+ 'flip_direction'))
51
+ ]
52
+ mosaic_affine_transform = [
53
+ dict(type='Mosaic',
54
+ img_scale=_base_.img_scale,
55
+ pad_val=114.0,
56
+ pre_transform=_base_.pre_transform),
57
+ dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
58
+ dict(
59
+ type='YOLOv5RandomAffine',
60
+ max_rotate_degree=0.0,
61
+ max_shear_degree=0.0,
62
+ max_aspect_ratio=100.,
63
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
64
+ # img_scale is (width, height)
65
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
66
+ border_val=(114, 114, 114),
67
+ min_area_ratio=_base_.min_area_ratio,
68
+ use_mask_refine=_base_.use_mask2refine)
69
+ ]
70
+ train_pipeline = [
71
+ *_base_.pre_transform, *mosaic_affine_transform,
72
+ dict(type='YOLOv5MixUp',
73
+ prob=mixup_prob,
74
+ pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
75
+ *_base_.last_transform[:-1], *final_transform
76
+ ]
77
+
78
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]
79
+
80
+ coco_train_dataset = dict(type='YOLOv5CocoDataset',
81
+ data_root='data/coco',
82
+ ann_file='annotations/instances_train2017.json',
83
+ data_prefix=dict(img='train2017/'),
84
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
85
+ pipeline=train_pipeline)
86
+
87
+ train_dataloader = dict(persistent_workers=persistent_workers,
88
+ batch_size=train_batch_size_per_gpu,
89
+ collate_fn=dict(type='yolow_collate'),
90
+ dataset=coco_train_dataset)
91
+
96
+ test_pipeline = [
97
+ *_base_.test_pipeline[:-1],
98
+ dict(type='mmdet.PackDetInputs',
99
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
100
+ 'scale_factor', 'pad_param'))
101
+ ]
102
+ coco_val_dataset = dict(type='YOLOv5CocoDataset',
103
+ data_root='data/coco',
104
+ ann_file='annotations/instances_val2017.json',
105
+ data_prefix=dict(img='val2017/'),
106
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
107
+ pipeline=test_pipeline)
108
+
109
+ val_dataloader = dict(dataset=coco_val_dataset)
110
+ test_dataloader = val_dataloader
111
+ # training settings
112
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
113
+ lr_factor=0.01,
114
+ max_epochs=max_epochs),
115
+ checkpoint=dict(max_keep_ckpts=-1,
116
+ save_best=None,
117
+ interval=save_epoch_intervals))
118
+ custom_hooks = [
119
+ dict(type='EMAHook',
120
+ ema_type='ExpMomentumEMA',
121
+ momentum=0.0001,
122
+ update_buffers=True,
123
+ strict_load=False,
124
+ priority=49),
125
+ dict(type='mmdet.PipelineSwitchHook',
126
+ switch_epoch=max_epochs - close_mosaic_epochs,
127
+ switch_pipeline=train_pipeline_stage2)
128
+ ]
129
+ train_cfg = dict(max_epochs=max_epochs,
130
+ val_interval=5,
131
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
132
+ _base_.val_interval_stage2)])
133
+ optim_wrapper = dict(optimizer=dict(
134
+ _delete_=True,
135
+ type='AdamW',
136
+ lr=base_lr,
137
+ weight_decay=weight_decay,
138
+ batch_size_per_gpu=train_batch_size_per_gpu),
139
+ constructor='YOLOWv5OptimizerConstructor')
140
+
141
+ # evaluation settings
142
+ val_evaluator = dict(_delete_=True,
143
+ type='mmdet.CocoMetric',
144
+ proposal_nums=(100, 1, 10),
145
+ ann_file='data/coco/annotations/instances_val2017.json',
146
+ metric='bbox')
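
The two reparameterized finetune configs above (the v2_s_bn and v2_s_rep_vlpan variants) drop the CLIP text branch entirely (with_text_model=False) and load *_rep_conv.pth checkpoints in which per-class text embeddings have already been folded into the neck and head, which is why the head's num_guide equals num_classes. The snippet below is only a hedged illustration of the kind of offline, unit-normalised class-text embeddings such a checkpoint bakes in; the model name and the short class list are assumptions, and this is not the repository's export tool.

# Minimal sketch (not YOLO-World's export script): unit-norm CLIP text embeddings
# for a fixed class list, i.e. the per-class guide vectors that a reparameterized
# *_rep_conv.pth checkpoint folds into its conv weights.
import torch
from transformers import CLIPTokenizer, CLIPTextModelWithProjection

model_name = 'openai/clip-vit-base-patch32'   # assumed; matches the other configs
class_names = ['person', 'bicycle', 'car']    # illustrative subset of the 80 COCO classes

tokenizer = CLIPTokenizer.from_pretrained(model_name)
text_encoder = CLIPTextModelWithProjection.from_pretrained(model_name).eval()

with torch.no_grad():
    tokens = tokenizer(class_names, padding=True, return_tensors='pt')
    embeds = text_encoder(**tokens).text_embeds                # (3, 512)
    embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True)   # unit-norm guides

print(embeds.shape)   # torch.Size([3, 512])
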
YOLO-World/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,184 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth'
21
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
22
+ text_model_name = 'openai/clip-vit-base-patch32'
23
+ persistent_workers = False
24
+ mixup_prob = 0.15
25
+ copypaste_prob = 0.3
26
+
27
+ # model settings
28
+ model = dict(
29
+ type='YOLOWorldDetector',
30
+ mm_neck=True,
31
+ num_train_classes=num_training_classes,
32
+ num_test_classes=num_classes,
33
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
34
+ backbone=dict(
35
+ _delete_=True,
36
+ type='MultiModalYOLOBackbone',
37
+ image_model={{_base_.model.backbone}},
38
+ text_model=dict(
39
+ type='HuggingCLIPLanguageBackbone',
40
+ model_name=text_model_name,
41
+ frozen_modules=['all'])),
42
+ neck=dict(type='YOLOWorldPAFPN',
43
+ guide_channels=text_channels,
44
+ embed_channels=neck_embed_channels,
45
+ num_heads=neck_num_heads,
46
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
47
+ bbox_head=dict(type='YOLOWorldHead',
48
+ head_module=dict(type='YOLOWorldHeadModule',
49
+ use_bn_head=True,
50
+ embed_dims=text_channels,
51
+ num_classes=num_training_classes)),
52
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
53
+
54
+ # dataset settings
55
+ text_transform = [
56
+ dict(type='RandomLoadText',
57
+ num_neg_samples=(num_classes, num_classes),
58
+ max_num_samples=num_training_classes,
59
+ padding_to_max=True,
60
+ padding_value=''),
61
+ dict(type='mmdet.PackDetInputs',
62
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
63
+ 'flip_direction', 'texts'))
64
+ ]
65
+ mosaic_affine_transform = [
66
+ dict(
67
+ type='MultiModalMosaic',
68
+ img_scale=_base_.img_scale,
69
+ pad_val=114.0,
70
+ pre_transform=_base_.pre_transform),
71
+ dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
72
+ dict(
73
+ type='YOLOv5RandomAffine',
74
+ max_rotate_degree=0.0,
75
+ max_shear_degree=0.0,
76
+ max_aspect_ratio=100.,
77
+ scaling_ratio_range=(1 - _base_.affine_scale,
78
+ 1 + _base_.affine_scale),
79
+ # img_scale is (width, height)
80
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
81
+ border_val=(114, 114, 114),
82
+ min_area_ratio=_base_.min_area_ratio,
83
+ use_mask_refine=_base_.use_mask2refine)
84
+ ]
85
+ train_pipeline = [
86
+ *_base_.pre_transform,
87
+ *mosaic_affine_transform,
88
+ dict(
89
+ type='YOLOv5MultiModalMixUp',
90
+ prob=mixup_prob,
91
+ pre_transform=[*_base_.pre_transform,
92
+ *mosaic_affine_transform]),
93
+ *_base_.last_transform[:-1],
94
+ *text_transform
95
+ ]
96
+ train_pipeline_stage2 = [
97
+ *_base_.train_pipeline_stage2[:-1],
98
+ *text_transform
99
+ ]
100
+ coco_train_dataset = dict(
101
+ _delete_=True,
102
+ type='MultiModalDataset',
103
+ dataset=dict(
104
+ type='YOLOv5CocoDataset',
105
+ data_root='data/coco',
106
+ ann_file='annotations/instances_train2017.json',
107
+ data_prefix=dict(img='train2017/'),
108
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
109
+ class_text_path='data/texts/coco_class_texts.json',
110
+ pipeline=train_pipeline)
111
+
112
+ train_dataloader = dict(
113
+ persistent_workers=persistent_workers,
114
+ batch_size=train_batch_size_per_gpu,
115
+ collate_fn=dict(type='yolow_collate'),
116
+ dataset=coco_train_dataset)
117
+ test_pipeline = [
118
+ *_base_.test_pipeline[:-1],
119
+ dict(type='LoadText'),
120
+ dict(
121
+ type='mmdet.PackDetInputs',
122
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
123
+ 'scale_factor', 'pad_param', 'texts'))
124
+ ]
125
+ coco_val_dataset = dict(
126
+ _delete_=True,
127
+ type='MultiModalDataset',
128
+ dataset=dict(
129
+ type='YOLOv5CocoDataset',
130
+ data_root='data/coco',
131
+ ann_file='annotations/instances_val2017.json',
132
+ data_prefix=dict(img='val2017/'),
133
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
134
+ class_text_path='data/texts/coco_class_texts.json',
135
+ pipeline=test_pipeline)
136
+ val_dataloader = dict(dataset=coco_val_dataset)
137
+ test_dataloader = val_dataloader
138
+ # training settings
139
+ default_hooks = dict(
140
+ param_scheduler=dict(
141
+ scheduler_type='linear',
142
+ lr_factor=0.01,
143
+ max_epochs=max_epochs),
144
+ checkpoint=dict(
145
+ max_keep_ckpts=-1,
146
+ save_best=None,
147
+ interval=save_epoch_intervals))
148
+ custom_hooks = [
149
+ dict(
150
+ type='EMAHook',
151
+ ema_type='ExpMomentumEMA',
152
+ momentum=0.0001,
153
+ update_buffers=True,
154
+ strict_load=False,
155
+ priority=49),
156
+ dict(
157
+ type='mmdet.PipelineSwitchHook',
158
+ switch_epoch=max_epochs - close_mosaic_epochs,
159
+ switch_pipeline=train_pipeline_stage2)
160
+ ]
161
+ train_cfg = dict(
162
+ max_epochs=max_epochs,
163
+ val_interval=5,
164
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
165
+ _base_.val_interval_stage2)])
166
+ optim_wrapper = dict(
167
+ optimizer=dict(
168
+ _delete_=True,
169
+ type='AdamW',
170
+ lr=base_lr,
171
+ weight_decay=weight_decay,
172
+ batch_size_per_gpu=train_batch_size_per_gpu),
173
+ paramwise_cfg=dict(
174
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
175
+ 'logit_scale': dict(weight_decay=0.0)}),
176
+ constructor='YOLOWv5OptimizerConstructor')
177
+
178
+ # evaluation settings
179
+ val_evaluator = dict(
180
+ _delete_=True,
181
+ type='mmdet.CocoMetric',
182
+ proposal_nums=(100, 1, 10),
183
+ ann_file='data/coco/annotations/instances_val2017.json',
184
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,183 @@
1
+ _base_ = (
2
+ '../../third_party/mmyolo/configs/yolov8/'
3
+ 'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py')
4
+ custom_imports = dict(
5
+ imports=['yolo_world'],
6
+ allow_failed_imports=False)
7
+
8
+ # hyper-parameters
9
+ num_classes = 80
10
+ num_training_classes = 80
11
+ max_epochs = 80 # Maximum training epochs
12
+ close_mosaic_epochs = 10
13
+ save_epoch_intervals = 5
14
+ text_channels = 512
15
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
16
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
17
+ base_lr = 2e-4
18
+ weight_decay = 0.05
19
+ train_batch_size_per_gpu = 16
20
+ load_from = 'pretrained_models/yolo_world_x_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc250k_train_lviseval-8698fbfa.pth'
21
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
22
+ text_model_name = 'openai/clip-vit-base-patch32'
23
+ persistent_workers = False
24
+
25
+ # model settings
26
+ model = dict(
27
+ type='YOLOWorldDetector',
28
+ mm_neck=True,
29
+ num_train_classes=num_training_classes,
30
+ num_test_classes=num_classes,
31
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
32
+ backbone=dict(
33
+ _delete_=True,
34
+ type='MultiModalYOLOBackbone',
35
+ image_model={{_base_.model.backbone}},
36
+ text_model=dict(
37
+ type='HuggingCLIPLanguageBackbone',
38
+ model_name=text_model_name,
39
+ frozen_modules=['all'])),
40
+ neck=dict(type='YOLOWorldPAFPN',
41
+ guide_channels=text_channels,
42
+ embed_channels=neck_embed_channels,
43
+ num_heads=neck_num_heads,
44
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
45
+ bbox_head=dict(type='YOLOWorldHead',
46
+ head_module=dict(type='YOLOWorldHeadModule',
47
+ use_bn_head=True,
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ text_transform = [
54
+ dict(type='RandomLoadText',
55
+ num_neg_samples=(num_classes, num_classes),
56
+ max_num_samples=num_training_classes,
57
+ padding_to_max=True,
58
+ padding_value=''),
59
+ dict(type='mmdet.PackDetInputs',
60
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
61
+ 'flip_direction', 'texts'))
62
+ ]
63
+ mosaic_affine_transform = [
64
+ dict(
65
+ type='MultiModalMosaic',
66
+ img_scale=_base_.img_scale,
67
+ pad_val=114.0,
68
+ pre_transform=_base_.pre_transform),
69
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
70
+ dict(
71
+ type='YOLOv5RandomAffine',
72
+ max_rotate_degree=0.0,
73
+ max_shear_degree=0.0,
74
+ max_aspect_ratio=100.,
75
+ scaling_ratio_range=(1 - _base_.affine_scale,
76
+ 1 + _base_.affine_scale),
77
+ # img_scale is (width, height)
78
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
79
+ border_val=(114, 114, 114),
80
+ min_area_ratio=_base_.min_area_ratio,
81
+ use_mask_refine=_base_.use_mask2refine)
82
+ ]
83
+ train_pipeline = [
84
+ *_base_.pre_transform,
85
+ *mosaic_affine_transform,
86
+ dict(
87
+ type='YOLOv5MultiModalMixUp',
88
+ prob=_base_.mixup_prob,
89
+ pre_transform=[*_base_.pre_transform,
90
+ *mosaic_affine_transform]),
91
+ *_base_.last_transform[:-1],
92
+ *text_transform
93
+ ]
94
+ train_pipeline_stage2 = [
95
+ *_base_.train_pipeline_stage2[:-1],
96
+ *text_transform
97
+ ]
98
+ coco_train_dataset = dict(
99
+ _delete_=True,
100
+ type='MultiModalDataset',
101
+ dataset=dict(
102
+ type='YOLOv5CocoDataset',
103
+ data_root='data/coco',
104
+ ann_file='annotations/instances_train2017.json',
105
+ data_prefix=dict(img='train2017/'),
106
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
107
+ class_text_path='data/texts/coco_class_texts.json',
108
+ pipeline=train_pipeline)
109
+
110
+ train_dataloader = dict(
111
+ persistent_workers=persistent_workers,
112
+ batch_size=train_batch_size_per_gpu,
113
+ collate_fn=dict(type='yolow_collate'),
114
+ dataset=coco_train_dataset)
115
+
116
+ test_pipeline = [
117
+ *_base_.test_pipeline[:-1],
118
+ dict(type='LoadText'),
119
+ dict(
120
+ type='mmdet.PackDetInputs',
121
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
122
+ 'scale_factor', 'pad_param', 'texts'))
123
+ ]
124
+
125
+ coco_val_dataset = dict(
126
+ _delete_=True,
127
+ type='MultiModalDataset',
128
+ dataset=dict(
129
+ type='YOLOv5CocoDataset',
130
+ data_root='data/coco',
131
+ ann_file='annotations/instances_val2017.json',
132
+ data_prefix=dict(img='val2017/'),
133
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
134
+ class_text_path='data/texts/coco_class_texts.json',
135
+ pipeline=test_pipeline)
136
+ val_dataloader = dict(dataset=coco_val_dataset)
137
+ test_dataloader = val_dataloader
138
+ # training settings
139
+ default_hooks = dict(
140
+ param_scheduler=dict(
141
+ scheduler_type='linear',
142
+ lr_factor=0.01,
143
+ max_epochs=max_epochs),
144
+ checkpoint=dict(
145
+ max_keep_ckpts=-1,
146
+ save_best=None,
147
+ interval=save_epoch_intervals))
148
+ custom_hooks = [
149
+ dict(
150
+ type='EMAHook',
151
+ ema_type='ExpMomentumEMA',
152
+ momentum=0.0001,
153
+ update_buffers=True,
154
+ strict_load=False,
155
+ priority=49),
156
+ dict(
157
+ type='mmdet.PipelineSwitchHook',
158
+ switch_epoch=max_epochs - close_mosaic_epochs,
159
+ switch_pipeline=train_pipeline_stage2)
160
+ ]
161
+ train_cfg = dict(
162
+ max_epochs=max_epochs,
163
+ val_interval=5,
164
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
165
+ _base_.val_interval_stage2)])
166
+ optim_wrapper = dict(
167
+ optimizer=dict(
168
+ _delete_=True,
169
+ type='AdamW',
170
+ lr=base_lr,
171
+ weight_decay=weight_decay,
172
+ batch_size_per_gpu=train_batch_size_per_gpu),
173
+ paramwise_cfg=dict(
174
+ custom_keys={'backbone.text_model': dict(lr_mult=0.01),
175
+ 'logit_scale': dict(weight_decay=0.0)}),
176
+ constructor='YOLOWv5OptimizerConstructor')
177
+ # evaluation settings
178
+ val_evaluator = dict(
179
+ _delete_=True,
180
+ type='mmdet.CocoMetric',
181
+ proposal_nums=(100, 1, 10),
182
+ ann_file='data/coco/annotations/instances_val2017.json',
183
+ metric='bbox')
YOLO-World/configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py ADDED
@@ -0,0 +1,173 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 80 # Maximum training epochs
9
+ close_mosaic_epochs = 10
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 2e-4
15
+ weight_decay = 0.05
16
+ train_batch_size_per_gpu = 16
17
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
18
+ text_model_name = 'openai/clip-vit-base-patch32'
19
+ persistent_workers = False
20
+
21
+ # scaling model from X to XL
22
+ deepen_factor = 1.0
23
+ widen_factor = 1.5
24
+
25
+ backbone = _base_.model.backbone
26
+ backbone.update(deepen_factor=deepen_factor, widen_factor=widen_factor)
27
+
28
+ # model settings
29
+ model = dict(type='YOLOWorldDetector',
30
+ mm_neck=True,
31
+ num_train_classes=num_training_classes,
32
+ num_test_classes=num_classes,
33
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
34
+ backbone=dict(_delete_=True,
35
+ type='MultiModalYOLOBackbone',
36
+ image_model=backbone,
37
+ text_model=dict(type='HuggingCLIPLanguageBackbone',
38
+ model_name=text_model_name,
39
+ frozen_modules=['all'])),
40
+ neck=dict(type='YOLOWorldPAFPN',
41
+ deepen_factor=deepen_factor,
42
+ widen_factor=widen_factor,
43
+ guide_channels=text_channels,
44
+ embed_channels=neck_embed_channels,
45
+ num_heads=neck_num_heads,
46
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
47
+ bbox_head=dict(type='YOLOWorldHead',
48
+ head_module=dict(
49
+ type='YOLOWorldHeadModule',
50
+ widen_factor=widen_factor,
51
+ use_bn_head=True,
52
+ embed_dims=text_channels,
53
+ num_classes=num_training_classes)),
54
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
55
+
56
+ # dataset settings
57
+ text_transform = [
58
+ dict(type='RandomLoadText',
59
+ num_neg_samples=(num_classes, num_classes),
60
+ max_num_samples=num_training_classes,
61
+ padding_to_max=True,
62
+ padding_value=''),
63
+ dict(type='mmdet.PackDetInputs',
64
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
65
+ 'flip_direction', 'texts'))
66
+ ]
67
+ mosaic_affine_transform = [
68
+ dict(type='MultiModalMosaic',
69
+ img_scale=_base_.img_scale,
70
+ pad_val=114.0,
71
+ pre_transform=_base_.pre_transform),
72
+ dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
73
+ dict(
74
+ type='YOLOv5RandomAffine',
75
+ max_rotate_degree=0.0,
76
+ max_shear_degree=0.0,
77
+ max_aspect_ratio=100.,
78
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
79
+ # img_scale is (width, height)
80
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
81
+ border_val=(114, 114, 114),
82
+ min_area_ratio=_base_.min_area_ratio,
83
+ use_mask_refine=_base_.use_mask2refine)
84
+ ]
85
+ train_pipeline = [
86
+ *_base_.pre_transform, *mosaic_affine_transform,
87
+ dict(type='YOLOv5MultiModalMixUp',
88
+ prob=_base_.mixup_prob,
89
+ pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
90
+ *_base_.last_transform[:-1], *text_transform
91
+ ]
92
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
93
+ coco_train_dataset = dict(_delete_=True,
94
+ type='MultiModalDataset',
95
+ dataset=dict(
96
+ type='YOLOv5CocoDataset',
97
+ data_root='data/coco',
98
+ ann_file='annotations/instances_train2017.json',
99
+ data_prefix=dict(img='train2017/'),
100
+ filter_cfg=dict(filter_empty_gt=False,
101
+ min_size=32)),
102
+ class_text_path='data/texts/coco_class_texts.json',
103
+ pipeline=train_pipeline)
104
+
105
+ train_dataloader = dict(persistent_workers=persistent_workers,
106
+ batch_size=train_batch_size_per_gpu,
107
+ collate_fn=dict(type='yolow_collate'),
108
+ dataset=coco_train_dataset)
109
+
110
+ test_pipeline = [
111
+ *_base_.test_pipeline[:-1],
112
+ dict(type='LoadText'),
113
+ dict(type='mmdet.PackDetInputs',
114
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
115
+ 'scale_factor', 'pad_param', 'texts'))
116
+ ]
117
+
118
+ coco_val_dataset = dict(
119
+ _delete_=True,
120
+ type='MultiModalDataset',
121
+ dataset=dict(type='YOLOv5CocoDataset',
122
+ data_root='data/coco',
123
+ ann_file='annotations/instances_val2017.json',
124
+ data_prefix=dict(img='val2017/'),
125
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
126
+ class_text_path='data/texts/coco_class_texts.json',
127
+ pipeline=test_pipeline)
128
+ val_dataloader = dict(dataset=coco_val_dataset)
129
+ test_dataloader = val_dataloader
130
+ # training settings
131
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
132
+ lr_factor=0.01,
133
+ max_epochs=max_epochs),
134
+ checkpoint=dict(max_keep_ckpts=-1,
135
+ save_best=None,
136
+ interval=save_epoch_intervals))
137
+ custom_hooks = [
138
+ dict(type='EMAHook',
139
+ ema_type='ExpMomentumEMA',
140
+ momentum=0.0001,
141
+ update_buffers=True,
142
+ strict_load=False,
143
+ priority=49),
144
+ dict(type='mmdet.PipelineSwitchHook',
145
+ switch_epoch=max_epochs - close_mosaic_epochs,
146
+ switch_pipeline=train_pipeline_stage2)
147
+ ]
148
+ train_cfg = dict(max_epochs=max_epochs,
149
+ val_interval=5,
150
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
151
+ _base_.val_interval_stage2)])
152
+ optim_wrapper = dict(optimizer=dict(
153
+ _delete_=True,
154
+ type='AdamW',
155
+ lr=base_lr,
156
+ weight_decay=weight_decay,
157
+ batch_size_per_gpu=train_batch_size_per_gpu),
158
+ paramwise_cfg=dict(bias_decay_mult=0.0,
159
+ norm_decay_mult=0.0,
160
+ custom_keys={
161
+ 'backbone.text_model':
162
+ dict(lr_mult=0.01),
163
+ 'logit_scale':
164
+ dict(weight_decay=0.0)
165
+ }),
166
+ constructor='YOLOWv5OptimizerConstructor')
167
+
168
+ # evaluation settings
169
+ val_evaluator = dict(_delete_=True,
170
+ type='mmdet.CocoMetric',
171
+ proposal_nums=(100, 1, 10),
172
+ ann_file='data/coco/annotations/instances_val2017.json',
173
+ metric='bbox')
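
The XL config above scales the X-sized base by setting deepen_factor = 1.0 and widen_factor = 1.5 on the backbone and passing the same widen_factor to the neck and head. The helper below is a rough, self-contained illustration of the usual YOLOv8-style convention these factors follow (channel counts rounded up to a multiple of 8, block counts rounded with a floor of 1); the function names and the divisor are assumptions, not imports from mmyolo.

import math

# Rough illustration (not the repository's code) of how deepen/widen factors are
# conventionally applied in YOLOv8-style models: widen_factor scales per-stage
# channel counts (rounded up to a multiple of `divisor`), deepen_factor scales
# the number of CSP blocks per stage (rounded, floor of 1).
def scale_channels(base_channels: int, widen_factor: float, divisor: int = 8) -> int:
    return int(math.ceil(base_channels * widen_factor / divisor) * divisor)

def scale_depth(base_blocks: int, deepen_factor: float) -> int:
    return max(round(base_blocks * deepen_factor), 1)

# With widen_factor = 1.5 and deepen_factor = 1.0 as above, a 256-channel stage
# grows to 384 channels while the per-stage block count is unchanged.
print(scale_channels(256, 1.5), scale_depth(3, 1.0))   # 384 3
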
YOLO-World/configs/image_prompts/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_image_prompt_demo.py ADDED
@@ -0,0 +1,127 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)
4
+
5
+ # hyper-parameters
6
+ num_classes = 80
7
+ num_training_classes = 80
8
+ max_epochs = 80 # Maximum training epochs
9
+ close_mosaic_epochs = 10
10
+ save_epoch_intervals = 5
11
+ text_channels = 512
12
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
13
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
14
+ base_lr = 2e-4
15
+ weight_decay = 0.05
16
+ train_batch_size_per_gpu = 16
17
+ load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
18
+ persistent_workers = False
19
+ text_model_name = '../pretrained_models/open-ai-clip-vit-base-patch32'
20
+ img_scale = (800, 800)
21
+
22
+ # model settings
23
+ model = dict(type='YOLOWorldImageDetector',
24
+ mm_neck=True,
25
+ num_train_classes=num_training_classes,
26
+ num_test_classes=num_classes,
27
+ vision_model=text_model_name,
28
+ prompt_dim=text_channels,
29
+ data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
30
+ backbone=dict(_delete_=True,
31
+ type='MultiModalYOLOBackbone',
32
+ image_model={{_base_.model.backbone}},
33
+ frozen_stages=4,
34
+ text_model=dict(type='HuggingCLIPLanguageBackbone',
35
+ model_name=text_model_name,
36
+ frozen_modules=['all'])),
37
+ neck=dict(type='YOLOWorldPAFPN',
38
+ freeze_all=True,
39
+ guide_channels=text_channels,
40
+ embed_channels=neck_embed_channels,
41
+ num_heads=neck_num_heads,
42
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
43
+ bbox_head=dict(type='YOLOWorldHead',
44
+ head_module=dict(
45
+ type='YOLOWorldHeadModule',
46
+ freeze_all=True,
47
+ use_bn_head=True,
48
+ embed_dims=text_channels,
49
+ num_classes=num_training_classes)),
50
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
51
+
52
+ # dataset settings
53
+ coco_train_dataset = dict(type='YOLOv5CocoDataset',
54
+ data_root='data/coco',
55
+ ann_file='annotations/instances_train2017.json',
56
+ data_prefix=dict(img='train2017/'),
57
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
58
+ pipeline=_base_.train_pipeline)
59
+
60
+ train_dataloader = dict(persistent_workers=persistent_workers,
61
+ batch_size=train_batch_size_per_gpu,
62
+ collate_fn=dict(type='yolow_collate'),
63
+ dataset=coco_train_dataset)
64
+ test_pipeline = [
65
+ *_base_.test_pipeline[:-1],
66
+ dict(type='LoadText'),
67
+ dict(
68
+ type='mmdet.PackDetInputs',
69
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
70
+ 'scale_factor', 'pad_param', 'texts'))
71
+ ]
72
+ coco_val_dataset = dict(
73
+ _delete_=True,
74
+ type='MultiModalDataset',
75
+ dataset=dict(
76
+ type='YOLOv5CocoDataset',
77
+ data_root='data/coco',
78
+ ann_file='annotations/instances_val2017.json',
79
+ data_prefix=dict(img='val2017/'),
80
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
81
+ class_text_path='data/texts/coco_class_texts.json',
82
+ pipeline=test_pipeline)
83
+ val_dataloader = dict(dataset=coco_val_dataset)
84
+ test_dataloader = val_dataloader
85
+ # training settings
86
+ default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
87
+ lr_factor=0.01,
88
+ max_epochs=max_epochs),
89
+ checkpoint=dict(max_keep_ckpts=-1,
90
+ save_best=None,
91
+ interval=save_epoch_intervals))
92
+ custom_hooks = [
93
+ dict(type='EMAHook',
94
+ ema_type='ExpMomentumEMA',
95
+ momentum=0.0001,
96
+ update_buffers=True,
97
+ strict_load=False,
98
+ priority=49),
99
+ dict(type='mmdet.PipelineSwitchHook',
100
+ switch_epoch=max_epochs - close_mosaic_epochs,
101
+ switch_pipeline=_base_.train_pipeline_stage2)
102
+ ]
103
+ train_cfg = dict(max_epochs=max_epochs,
104
+ val_interval=5,
105
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
106
+ _base_.val_interval_stage2)])
107
+
108
+ optim_wrapper = dict(optimizer=dict(
109
+ _delete_=True,
110
+ type='AdamW',
111
+ lr=base_lr,
112
+ weight_decay=weight_decay,
113
+ batch_size_per_gpu=train_batch_size_per_gpu),
114
+ paramwise_cfg=dict(
115
+ custom_keys={
116
+ 'backbone.text_model': dict(lr_mult=0.01),
117
+ 'logit_scale': dict(weight_decay=0.0),
118
+ 'embeddings': dict(weight_decay=0.0)
119
+ }),
120
+ constructor='YOLOWv5OptimizerConstructor')
121
+
122
+ # evaluation settings
123
+ val_evaluator = dict(_delete_=True,
124
+ type='mmdet.CocoMetric',
125
+ proposal_nums=(100, 1, 10),
126
+ ann_file='data/coco/annotations/instances_val2017.json',
127
+ metric='bbox')
YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py ADDED
@@ -0,0 +1,200 @@
1
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
2
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
3
+ custom_imports = dict(imports=['yolo_world'],
4
+ allow_failed_imports=False)
5
+
6
+ # hyper-parameters
7
+ num_classes = 1203
8
+ num_training_classes = 80
9
+ max_epochs = 100 # Maximum training epochs
10
+ close_mosaic_epochs = 2
11
+ save_epoch_intervals = 2
12
+ text_channels = 768
13
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
14
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
15
+ base_lr = 2e-3
16
+ weight_decay = 0.0125
17
+ train_batch_size_per_gpu = 16
18
+ # text_model_name = '../pretrained_models/clip-vit-large-patch14-336'
19
+ text_model_name = 'openai/clip-vit-large-patch14-336'
20
+ img_scale = (800, 800)
21
+
22
+ # model settings
23
+ model = dict(
24
+ type='YOLOWorldDetector',
25
+ mm_neck=True,
26
+ num_train_classes=num_training_classes,
27
+ num_test_classes=num_classes,
28
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
29
+ backbone=dict(
30
+ _delete_=True,
31
+ type='MultiModalYOLOBackbone',
32
+ image_model={{_base_.model.backbone}},
33
+ text_model=dict(
34
+ type='HuggingCLIPLanguageBackbone',
35
+ model_name=text_model_name,
36
+ frozen_modules=['all'])),
37
+ neck=dict(type='YOLOWorldPAFPN',
38
+ guide_channels=text_channels,
39
+ embed_channels=neck_embed_channels,
40
+ num_heads=neck_num_heads,
41
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
42
+ bbox_head=dict(type='YOLOWorldHead',
43
+ head_module=dict(type='YOLOWorldHeadModule',
44
+ use_bn_head=True,
45
+ embed_dims=text_channels,
46
+ num_classes=num_training_classes)),
47
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
48
+
49
+ # dataset settings
50
+ text_transform = [
51
+ dict(type='RandomLoadText',
52
+ num_neg_samples=(num_classes, num_classes),
53
+ max_num_samples=num_training_classes,
54
+ padding_to_max=True,
55
+ padding_value=''),
56
+ dict(type='mmdet.PackDetInputs',
57
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
58
+ 'flip_direction', 'texts'))
59
+ ]
60
+ train_pipeline = [
61
+ *_base_.pre_transform,
62
+ dict(type='MultiModalMosaic',
63
+ img_scale=img_scale,
64
+ pad_val=114.0,
65
+ pre_transform=_base_.pre_transform),
66
+ dict(
67
+ type='YOLOv5RandomAffine',
68
+ max_rotate_degree=0.0,
69
+ max_shear_degree=0.0,
70
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
71
+ max_aspect_ratio=_base_.max_aspect_ratio,
72
+ border=(-img_scale[0] // 2, -img_scale[1] // 2),
73
+ border_val=(114, 114, 114)),
74
+ *_base_.last_transform[:-1],
75
+ *text_transform,
76
+ ]
77
+
78
+ train_pipeline_stage2 = [
79
+ *_base_.pre_transform,
80
+ dict(type='YOLOv5KeepRatioResize', scale=img_scale),
81
+ dict(
82
+ type='LetterResize',
83
+ scale=img_scale,
84
+ allow_scale_up=True,
85
+ pad_val=dict(img=114.0)),
86
+ dict(
87
+ type='YOLOv5RandomAffine',
88
+ max_rotate_degree=0.0,
89
+ max_shear_degree=0.0,
90
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
91
+ max_aspect_ratio=_base_.max_aspect_ratio,
92
+ border_val=(114, 114, 114)),
93
+ *_base_.last_transform[:-1],
94
+ *text_transform
95
+ ]
96
+
97
+ obj365v1_train_dataset = dict(
98
+ type='MultiModalDataset',
99
+ dataset=dict(
100
+ type='YOLOv5Objects365V1Dataset',
101
+ data_root='data/objects365v1/',
102
+ ann_file='annotations/objects365_train.json',
103
+ data_prefix=dict(img='train/'),
104
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
105
+ class_text_path='data/texts/obj365v1_class_texts.json',
106
+ pipeline=train_pipeline)
107
+
108
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
109
+ data_root='data/mixed_grounding/',
110
+ ann_file='annotations/final_mixed_train_no_coco.json',
111
+ data_prefix=dict(img='gqa/images/'),
112
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
113
+ pipeline=train_pipeline)
114
+
115
+ flickr_train_dataset = dict(
116
+ type='YOLOv5MixedGroundingDataset',
117
+ data_root='data/flickr/',
118
+ ann_file='annotations/final_flickr_separateGT_train.json',
119
+ data_prefix=dict(img='full_images/'),
120
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
121
+ pipeline=train_pipeline)
122
+
123
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
124
+ collate_fn=dict(type='yolow_collate'),
125
+ dataset=dict(_delete_=True,
126
+ type='ConcatDataset',
127
+ datasets=[
128
+ obj365v1_train_dataset,
129
+ flickr_train_dataset, mg_train_dataset
130
+ ],
131
+ ignore_keys=['classes', 'palette']))
132
+
133
+ test_pipeline = [
134
+ dict(type='LoadImageFromFile'),
135
+ dict(type='YOLOv5KeepRatioResize', scale=img_scale),
136
+ dict(
137
+ type='LetterResize',
138
+ scale=img_scale,
139
+ allow_scale_up=False,
140
+ pad_val=dict(img=114)),
141
+ dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
142
+ dict(type='LoadText'),
143
+ dict(type='mmdet.PackDetInputs',
144
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
145
+ 'scale_factor', 'pad_param', 'texts'))
146
+ ]
147
+
148
+ coco_val_dataset = dict(
149
+ _delete_=True,
150
+ type='MultiModalDataset',
151
+ dataset=dict(type='YOLOv5LVISV1Dataset',
152
+ data_root='data/coco/',
153
+ test_mode=True,
154
+ ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
155
+ data_prefix=dict(img=''),
156
+ batch_shapes_cfg=None),
157
+ class_text_path='data/texts/lvis_v1_class_texts.json',
158
+ pipeline=test_pipeline)
159
+ val_dataloader = dict(dataset=coco_val_dataset)
160
+ test_dataloader = val_dataloader
161
+
162
+ val_evaluator = dict(type='mmdet.LVISMetric',
163
+ ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
164
+ metric='bbox')
165
+ test_evaluator = val_evaluator
166
+
167
+ # training settings
168
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
169
+ checkpoint=dict(interval=save_epoch_intervals,
170
+ rule='greater'))
171
+ custom_hooks = [
172
+ dict(type='EMAHook',
173
+ ema_type='ExpMomentumEMA',
174
+ momentum=0.0001,
175
+ update_buffers=True,
176
+ strict_load=False,
177
+ priority=49),
178
+ dict(type='mmdet.PipelineSwitchHook',
179
+ switch_epoch=max_epochs - close_mosaic_epochs,
180
+ switch_pipeline=train_pipeline_stage2)
181
+ ]
182
+ train_cfg = dict(max_epochs=max_epochs,
183
+ val_interval=10,
184
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
185
+ _base_.val_interval_stage2)])
186
+ optim_wrapper = dict(optimizer=dict(
187
+ _delete_=True,
188
+ type='AdamW',
189
+ lr=base_lr,
190
+ weight_decay=weight_decay,
191
+ batch_size_per_gpu=train_batch_size_per_gpu),
192
+ paramwise_cfg=dict(bias_decay_mult=0.0,
193
+ norm_decay_mult=0.0,
194
+ custom_keys={
195
+ 'backbone.text_model':
196
+ dict(lr_mult=0.01),
197
+ 'logit_scale':
198
+ dict(weight_decay=0.0)
199
+ }),
200
+ constructor='YOLOWv5OptimizerConstructor')
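The optimizer block above relies on paramwise_cfg to treat parameter groups differently: bias and norm parameters get no weight decay, logit_scale gets no weight decay, and any trainable parameter under backbone.text_model has its learning rate scaled by lr_mult=0.01 (the CLIP text tower itself is frozen here via frozen_modules=['all']). The implied per-group learning rates are simple arithmetic on the values defined in this config, not YOLO-World code:

# Per-group learning rates implied by the optimizer settings above.
base_lr = 2e-3        # optimizer lr in this config
text_lr_mult = 0.01   # custom_keys['backbone.text_model'] lr_mult

default_group_lr = base_lr                    # image backbone, neck, head
text_model_group_lr = base_lr * text_lr_mult  # 2e-05 for any trainable text params

print(default_group_lr, text_model_group_lr)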
YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ADDED
@@ -0,0 +1,171 @@
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(imports=['yolo_world'],
+ allow_failed_imports=False)
+
+ # hyper-parameters
+ num_classes = 1203
+ num_training_classes = 80
+ max_epochs = 100 # Maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 2
+ text_channels = 768
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-3
+ weight_decay = 0.0125
+ train_batch_size_per_gpu = 16
+ # text_model_name = '../pretrained_models/clip-vit-large-patch14-336'
+ text_model_name = 'openai/clip-vit-large-patch14-336'
+ # model settings
+ model = dict(
+ type='YOLOWorldDetector',
+ mm_neck=True,
+ num_train_classes=num_training_classes,
+ num_test_classes=num_classes,
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+ backbone=dict(
+ _delete_=True,
+ type='MultiModalYOLOBackbone',
+ image_model={{_base_.model.backbone}},
+ text_model=dict(
+ type='HuggingCLIPLanguageBackbone',
+ model_name=text_model_name,
+ frozen_modules=['all'])),
+ neck=dict(type='YOLOWorldPAFPN',
+ guide_channels=text_channels,
+ embed_channels=neck_embed_channels,
+ num_heads=neck_num_heads,
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
+ bbox_head=dict(type='YOLOWorldHead',
+ head_module=dict(type='YOLOWorldHeadModule',
+ use_bn_head=True,
+ embed_dims=text_channels,
+ num_classes=num_training_classes)),
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
+
+ # dataset settings
+ text_transform = [
+ dict(type='RandomLoadText',
+ num_neg_samples=(num_classes, num_classes),
+ max_num_samples=num_training_classes,
+ padding_to_max=True,
+ padding_value=''),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+ 'flip_direction', 'texts'))
+ ]
+ train_pipeline = [
+ *_base_.pre_transform,
+ dict(type='MultiModalMosaic',
+ img_scale=_base_.img_scale,
+ pad_val=114.0,
+ pre_transform=_base_.pre_transform),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform,
+ ]
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
+ obj365v1_train_dataset = dict(
+ type='MultiModalDataset',
+ dataset=dict(
+ type='YOLOv5Objects365V1Dataset',
+ data_root='data/objects365v1/',
+ ann_file='annotations/objects365_train.json',
+ data_prefix=dict(img='train/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ class_text_path='data/texts/obj365v1_class_texts.json',
+ pipeline=train_pipeline)
+
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+ data_root='data/mixed_grounding/',
+ ann_file='annotations/final_mixed_train_no_coco.json',
+ data_prefix=dict(img='gqa/images/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+ type='YOLOv5MixedGroundingDataset',
+ data_root='data/flickr/',
+ ann_file='annotations/final_flickr_separateGT_train.json',
+ data_prefix=dict(img='full_images/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)
+
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+ collate_fn=dict(type='yolow_collate'),
+ dataset=dict(_delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ obj365v1_train_dataset,
+ flickr_train_dataset, mg_train_dataset
+ ],
+ ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+ *_base_.test_pipeline[:-1],
+ dict(type='LoadText'),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor', 'pad_param', 'texts'))
+ ]
+ coco_val_dataset = dict(
+ _delete_=True,
+ type='MultiModalDataset',
+ dataset=dict(type='YOLOv5LVISV1Dataset',
+ data_root='data/coco/',
+ test_mode=True,
+ ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
+ data_prefix=dict(img=''),
+ batch_shapes_cfg=None),
+ class_text_path='data/texts/lvis_v1_class_texts.json',
+ pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
+
+ val_evaluator = dict(type='mmdet.LVISMetric',
+ ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
+ metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+ checkpoint=dict(interval=save_epoch_intervals,
+ rule='greater'))
+ custom_hooks = [
+ dict(type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+ dict(type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - close_mosaic_epochs,
+ switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+ val_interval=10,
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+ _base_.val_interval_stage2)])
+ optim_wrapper = dict(optimizer=dict(
+ _delete_=True,
+ type='AdamW',
+ lr=base_lr,
+ weight_decay=weight_decay,
+ batch_size_per_gpu=train_batch_size_per_gpu),
+ paramwise_cfg=dict(bias_decay_mult=0.0,
+ norm_decay_mult=0.0,
+ custom_keys={
+ 'backbone.text_model':
+ dict(lr_mult=0.01),
+ 'logit_scale':
+ dict(weight_decay=0.0)
+ }),
+ constructor='YOLOWv5OptimizerConstructor')
YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py ADDED
@@ -0,0 +1,202 @@
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(imports=['yolo_world'],
+ allow_failed_imports=False)
+
+ # hyper-parameters
+ num_classes = 1203
+ num_training_classes = 80
+ max_epochs = 20 # Maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 2
+ text_channels = 512
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-4
+ weight_decay = 0.025
+ train_batch_size_per_gpu = 4
+ load_from = "pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth"
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
+ text_model_name = 'openai/clip-vit-base-patch32'
+ img_scale = (1280, 1280)
+
+ # model settings
+ model = dict(
+ type='YOLOWorldDetector',
+ mm_neck=True,
+ num_train_classes=num_training_classes,
+ num_test_classes=num_classes,
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+ backbone=dict(
+ _delete_=True,
+ type='MultiModalYOLOBackbone',
+ image_model={{_base_.model.backbone}},
+ text_model=dict(
+ type='HuggingCLIPLanguageBackbone',
+ model_name=text_model_name,
+ frozen_modules=['all'])),
+ neck=dict(type='YOLOWorldPAFPN',
+ guide_channels=text_channels,
+ embed_channels=neck_embed_channels,
+ num_heads=neck_num_heads,
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
+ bbox_head=dict(type='YOLOWorldHead',
+ head_module=dict(type='YOLOWorldHeadModule',
+ use_bn_head=True,
+ embed_dims=text_channels,
+ num_classes=num_training_classes)),
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
+
+ # dataset settings
+ text_transform = [
+ dict(type='RandomLoadText',
+ num_neg_samples=(num_classes, num_classes),
+ max_num_samples=num_training_classes,
+ padding_to_max=True,
+ padding_value=''),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+ 'flip_direction', 'texts'))
+ ]
+ train_pipeline = [
+ *_base_.pre_transform,
+ dict(type='MultiModalMosaic',
+ img_scale=img_scale,
+ pad_val=114.0,
+ pre_transform=_base_.pre_transform),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border=(-img_scale[0] // 2, -img_scale[1] // 2),
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform,
+ ]
+
+ train_pipeline_stage2 = [
+ *_base_.pre_transform,
+ dict(type='YOLOv5KeepRatioResize', scale=img_scale),
+ dict(
+ type='LetterResize',
+ scale=img_scale,
+ allow_scale_up=True,
+ pad_val=dict(img=114.0)),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform
+ ]
+
+ obj365v1_train_dataset = dict(
+ type='MultiModalDataset',
+ dataset=dict(
+ type='YOLOv5Objects365V1Dataset',
+ data_root='data/objects365v1/',
+ ann_file='annotations/objects365_train.json',
+ data_prefix=dict(img='train/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ class_text_path='data/texts/obj365v1_class_texts.json',
+ pipeline=train_pipeline)
+
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+ data_root='data/mixed_grounding/',
+ ann_file='annotations/final_mixed_train_no_coco.json',
+ data_prefix=dict(img='gqa/images/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+ type='YOLOv5MixedGroundingDataset',
+ data_root='data/flickr/',
+ ann_file='annotations/final_flickr_separateGT_train.json',
+ data_prefix=dict(img='full_images/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)
+
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+ collate_fn=dict(type='yolow_collate'),
+ dataset=dict(_delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ obj365v1_train_dataset,
+ flickr_train_dataset, mg_train_dataset
+ ],
+ ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='YOLOv5KeepRatioResize', scale=img_scale),
+ dict(
+ type='LetterResize',
+ scale=img_scale,
+ allow_scale_up=False,
+ pad_val=dict(img=114)),
+ dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
+ dict(type='LoadText'),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor', 'pad_param', 'texts'))
+ ]
+
+ coco_val_dataset = dict(
+ _delete_=True,
+ type='MultiModalDataset',
+ dataset=dict(type='YOLOv5LVISV1Dataset',
+ data_root='data/coco/',
+ test_mode=True,
+ ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
+ data_prefix=dict(img=''),
+ batch_shapes_cfg=None),
+ class_text_path='data/texts/lvis_v1_class_texts.json',
+ pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
+
+ val_evaluator = dict(type='mmdet.LVISMetric',
+ ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
+ metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+ checkpoint=dict(interval=save_epoch_intervals,
+ rule='greater'))
+ custom_hooks = [
+ dict(type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+ dict(type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - close_mosaic_epochs,
+ switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+ val_interval=10,
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+ _base_.val_interval_stage2)])
+
+ optim_wrapper = dict(optimizer=dict(
+ _delete_=True,
+ type='AdamW',
+ lr=base_lr,
+ weight_decay=weight_decay,
+ batch_size_per_gpu=train_batch_size_per_gpu),
+ paramwise_cfg=dict(bias_decay_mult=0.0,
+ norm_decay_mult=0.0,
+ custom_keys={
+ 'backbone.text_model':
+ dict(lr_mult=0.01),
+ 'logit_scale':
+ dict(weight_decay=0.0)
+ }),
+ constructor='YOLOWv5OptimizerConstructor')
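Unlike the 100-epoch pretraining configs, the 1280x1280 config above is a short fine-tune: it initializes from the checkpoint named in load_from, drops the learning rate to 2e-4, and trains only 20 epochs at batch size 4 per GPU. A small sketch for inspecting or overriding those fields before launching, assuming MMEngine's Config API, that the repo is importable, and a hypothetical local checkpoint path:

from mmengine.config import Config

cfg = Config.fromfile(
    'YOLO-World/configs/pretrain/'
    'yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py')

# Inspect the values defined above: (1280, 1280), 20 epochs, and the load_from path.
print(cfg.img_scale, cfg.max_epochs, cfg.load_from)

# Point load_from at a locally downloaded checkpoint (hypothetical path).
cfg.load_from = '/data/ckpts/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth'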
YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ADDED
@@ -0,0 +1,171 @@
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(imports=['yolo_world'],
+ allow_failed_imports=False)
+
+ # hyper-parameters
+ num_classes = 1203
+ num_training_classes = 80
+ max_epochs = 100 # Maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 2
+ text_channels = 512
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-3
+ weight_decay = 0.05 / 2
+ train_batch_size_per_gpu = 16
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
+ text_model_name = 'openai/clip-vit-base-patch32'
+ # model settings
+ model = dict(
+ type='YOLOWorldDetector',
+ mm_neck=True,
+ num_train_classes=num_training_classes,
+ num_test_classes=num_classes,
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+ backbone=dict(
+ _delete_=True,
+ type='MultiModalYOLOBackbone',
+ image_model={{_base_.model.backbone}},
+ text_model=dict(
+ type='HuggingCLIPLanguageBackbone',
+ model_name=text_model_name,
+ frozen_modules=['all'])),
+ neck=dict(type='YOLOWorldPAFPN',
+ guide_channels=text_channels,
+ embed_channels=neck_embed_channels,
+ num_heads=neck_num_heads,
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
+ bbox_head=dict(type='YOLOWorldHead',
+ head_module=dict(type='YOLOWorldHeadModule',
+ use_bn_head=True,
+ embed_dims=text_channels,
+ num_classes=num_training_classes)),
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
+
+ # dataset settings
+ text_transform = [
+ dict(type='RandomLoadText',
+ num_neg_samples=(num_classes, num_classes),
+ max_num_samples=num_training_classes,
+ padding_to_max=True,
+ padding_value=''),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+ 'flip_direction', 'texts'))
+ ]
+ train_pipeline = [
+ *_base_.pre_transform,
+ dict(type='MultiModalMosaic',
+ img_scale=_base_.img_scale,
+ pad_val=114.0,
+ pre_transform=_base_.pre_transform),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform,
+ ]
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
+ obj365v1_train_dataset = dict(
+ type='MultiModalDataset',
+ dataset=dict(
+ type='YOLOv5Objects365V1Dataset',
+ data_root='data/objects365v1/',
+ ann_file='annotations/objects365_train.json',
+ data_prefix=dict(img='train/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ class_text_path='data/texts/obj365v1_class_texts.json',
+ pipeline=train_pipeline)
+
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+ data_root='data/mixed_grounding/',
+ ann_file='annotations/final_mixed_train_no_coco.json',
+ data_prefix=dict(img='gqa/images/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+ type='YOLOv5MixedGroundingDataset',
+ data_root='data/flickr/',
+ ann_file='annotations/final_flickr_separateGT_train.json',
+ data_prefix=dict(img='full_images/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)
+
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+ collate_fn=dict(type='yolow_collate'),
+ dataset=dict(_delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ obj365v1_train_dataset,
+ flickr_train_dataset, mg_train_dataset
+ ],
+ ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+ *_base_.test_pipeline[:-1],
+ dict(type='LoadText'),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor', 'pad_param', 'texts'))
+ ]
+ coco_val_dataset = dict(
+ _delete_=True,
+ type='MultiModalDataset',
+ dataset=dict(type='YOLOv5LVISV1Dataset',
+ data_root='data/coco/',
+ test_mode=True,
+ ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
+ data_prefix=dict(img=''),
+ batch_shapes_cfg=None),
+ class_text_path='data/texts/lvis_v1_class_texts.json',
+ pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
+
+ val_evaluator = dict(type='mmdet.LVISMetric',
+ ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
+ metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+ checkpoint=dict(interval=save_epoch_intervals,
+ rule='greater'))
+ custom_hooks = [
+ dict(type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+ dict(type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - close_mosaic_epochs,
+ switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+ val_interval=10,
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+ _base_.val_interval_stage2)])
+ optim_wrapper = dict(optimizer=dict(
+ _delete_=True,
+ type='AdamW',
+ lr=base_lr,
+ weight_decay=weight_decay,
+ batch_size_per_gpu=train_batch_size_per_gpu),
+ paramwise_cfg=dict(bias_decay_mult=0.0,
+ norm_decay_mult=0.0,
+ custom_keys={
+ 'backbone.text_model':
+ dict(lr_mult=0.01),
+ 'logit_scale':
+ dict(weight_decay=0.0)
+ }),
+ constructor='YOLOWv5OptimizerConstructor')
YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py ADDED
@@ -0,0 +1,171 @@
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
+ 'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(imports=['yolo_world'],
+ allow_failed_imports=False)
+
+ # hyper-parameters
+ num_classes = 1203
+ num_training_classes = 80
+ max_epochs = 100 # Maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 2
+ text_channels = 512
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-3
+ weight_decay = 0.05 / 2
+ train_batch_size_per_gpu = 16
+ # text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
+ text_model_name = 'openai/clip-vit-base-patch32'
+ # model settings
+ model = dict(
+ type='YOLOWorldDetector',
+ mm_neck=True,
+ num_train_classes=num_training_classes,
+ num_test_classes=num_classes,
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+ backbone=dict(
+ _delete_=True,
+ type='MultiModalYOLOBackbone',
+ image_model={{_base_.model.backbone}},
+ text_model=dict(
+ type='HuggingCLIPLanguageBackbone',
+ model_name=text_model_name,
+ frozen_modules=['all'])),
+ neck=dict(type='YOLOWorldPAFPN',
+ guide_channels=text_channels,
+ embed_channels=neck_embed_channels,
+ num_heads=neck_num_heads,
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
+ bbox_head=dict(type='YOLOWorldHead',
+ head_module=dict(type='YOLOWorldHeadModule',
+ use_bn_head=True,
+ embed_dims=text_channels,
+ num_classes=num_training_classes)),
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
+
+ # dataset settings
+ text_transform = [
+ dict(type='RandomLoadText',
+ num_neg_samples=(num_classes, num_classes),
+ max_num_samples=num_training_classes,
+ padding_to_max=True,
+ padding_value=''),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+ 'flip_direction', 'texts'))
+ ]
+ train_pipeline = [
+ *_base_.pre_transform,
+ dict(type='MultiModalMosaic',
+ img_scale=_base_.img_scale,
+ pad_val=114.0,
+ pre_transform=_base_.pre_transform),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform,
+ ]
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
+ obj365v1_train_dataset = dict(
+ type='MultiModalDataset',
+ dataset=dict(
+ type='YOLOv5Objects365V1Dataset',
+ data_root='data/objects365v1/',
+ ann_file='annotations/objects365_train.json',
+ data_prefix=dict(img='train/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ class_text_path='data/texts/obj365v1_class_texts.json',
+ pipeline=train_pipeline)
+
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+ data_root='data/mixed_grounding/',
+ ann_file='annotations/final_mixed_train_no_coco.json',
+ data_prefix=dict(img='gqa/images/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+ type='YOLOv5MixedGroundingDataset',
+ data_root='data/flickr/',
+ ann_file='annotations/final_flickr_separateGT_train.json',
+ data_prefix=dict(img='full_images/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)
+
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+ collate_fn=dict(type='yolow_collate'),
+ dataset=dict(_delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ obj365v1_train_dataset,
+ flickr_train_dataset, mg_train_dataset
+ ],
+ ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+ *_base_.test_pipeline[:-1],
+ dict(type='LoadText'),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor', 'pad_param', 'texts'))
+ ]
+ coco_val_dataset = dict(
+ _delete_=True,
+ type='MultiModalDataset',
+ dataset=dict(type='YOLOv5LVISV1Dataset',
+ data_root='data/coco/',
+ test_mode=True,
+ ann_file='lvis/lvis_v1_val.json',
+ data_prefix=dict(img=''),
+ batch_shapes_cfg=None),
+ class_text_path='data/texts/lvis_v1_class_texts.json',
+ pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
+
+ val_evaluator = dict(type='mmdet.LVISMetric',
+ ann_file='data/coco/lvis/lvis_v1_val.json',
+ metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+ checkpoint=dict(interval=save_epoch_intervals,
+ rule='greater'))
+ custom_hooks = [
+ dict(type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+ dict(type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - close_mosaic_epochs,
+ switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+ val_interval=10,
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+ _base_.val_interval_stage2)])
+ optim_wrapper = dict(optimizer=dict(
+ _delete_=True,
+ type='AdamW',
+ lr=base_lr,
+ weight_decay=weight_decay,
+ batch_size_per_gpu=train_batch_size_per_gpu),
+ paramwise_cfg=dict(bias_decay_mult=0.0,
+ norm_decay_mult=0.0,
+ custom_keys={
+ 'backbone.text_model':
+ dict(lr_mult=0.01),
+ 'logit_scale':
+ dict(weight_decay=0.0)
+ }),
+ constructor='YOLOWv5OptimizerConstructor')
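Across these pretraining configs the heavy mosaic/affine pipeline is swapped for train_pipeline_stage2 near the end of training by mmdet.PipelineSwitchHook, and validation becomes more frequent from that point via dynamic_intervals. The switch epoch is simply max_epochs - close_mosaic_epochs; plain arithmetic on the values defined above:

# Switch epochs implied by the configs in this commit (illustration only).
for name, max_epochs, close_mosaic_epochs in [
        ('100-epoch pretrain', 100, 2),
        ('20-epoch 1280x1280 fine-tune', 20, 2)]:
    print(name, 'switches pipelines at epoch', max_epochs - close_mosaic_epochs)
# -> epoch 98 and epoch 18; after that, val_interval_stage2 from the base config applies.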
YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py ADDED
@@ -0,0 +1,198 @@
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
+ 'yolov8_m_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(imports=['yolo_world'],
+ allow_failed_imports=False)
+
+ # hyper-parameters
+ num_classes = 1203
+ num_training_classes = 80
+ max_epochs = 100 # Maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 2
+ text_channels = 512
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-3
+ weight_decay = 0.05 / 2
+ train_batch_size_per_gpu = 16
+ text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
+ img_scale = (1280, 1280)
+
+ text_model_name = 'openai/clip-vit-base-patch32'
+ # model settings
+ model = dict(
+ type='YOLOWorldDetector',
+ mm_neck=True,
+ num_train_classes=num_training_classes,
+ num_test_classes=num_classes,
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+ backbone=dict(
+ _delete_=True,
+ type='MultiModalYOLOBackbone',
+ image_model={{_base_.model.backbone}},
+ text_model=dict(
+ type='HuggingCLIPLanguageBackbone',
+ model_name=text_model_name,
+ frozen_modules=['all'])),
+ neck=dict(type='YOLOWorldPAFPN',
+ guide_channels=text_channels,
+ embed_channels=neck_embed_channels,
+ num_heads=neck_num_heads,
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
+ bbox_head=dict(type='YOLOWorldHead',
+ head_module=dict(type='YOLOWorldHeadModule',
+ use_bn_head=True,
+ embed_dims=text_channels,
+ num_classes=num_training_classes)),
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
+
+ # dataset settings
+ text_transform = [
+ dict(type='RandomLoadText',
+ num_neg_samples=(num_classes, num_classes),
+ max_num_samples=num_training_classes,
+ padding_to_max=True,
+ padding_value=''),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+ 'flip_direction', 'texts'))
+ ]
+ train_pipeline = [
+ *_base_.pre_transform,
+ dict(type='MultiModalMosaic',
+ img_scale=img_scale,
+ pad_val=114.0,
+ pre_transform=_base_.pre_transform),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border=(-img_scale[0] // 2, -img_scale[1] // 2),
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform,
+ ]
+
+ train_pipeline_stage2 = [
+ *_base_.pre_transform,
+ dict(type='YOLOv5KeepRatioResize', scale=img_scale),
+ dict(
+ type='LetterResize',
+ scale=img_scale,
+ allow_scale_up=True,
+ pad_val=dict(img=114.0)),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform
+ ]
+ obj365v1_train_dataset = dict(
+ type='MultiModalDataset',
+ dataset=dict(
+ type='YOLOv5Objects365V1Dataset',
+ data_root='data/objects365v1/',
+ ann_file='annotations/objects365_train.json',
+ data_prefix=dict(img='train/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ class_text_path='data/texts/obj365v1_class_texts.json',
+ pipeline=train_pipeline)
+
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+ data_root='data/mixed_grounding/',
+ ann_file='annotations/final_mixed_train_no_coco.json',
+ data_prefix=dict(img='gqa/images/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+ type='YOLOv5MixedGroundingDataset',
+ data_root='data/flickr/',
+ ann_file='annotations/final_flickr_separateGT_train.json',
+ data_prefix=dict(img='full_images/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)
+
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+ collate_fn=dict(type='yolow_collate'),
+ dataset=dict(_delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ obj365v1_train_dataset,
+ flickr_train_dataset, mg_train_dataset
+ ],
+ ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='YOLOv5KeepRatioResize', scale=img_scale),
+ dict(
+ type='LetterResize',
+ scale=img_scale,
+ allow_scale_up=False,
+ pad_val=dict(img=114)),
+ dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
+ dict(type='LoadText'),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor', 'pad_param', 'texts'))
+ ]
+ coco_val_dataset = dict(
+ _delete_=True,
+ type='MultiModalDataset',
+ dataset=dict(type='YOLOv5LVISV1Dataset',
+ data_root='data/coco/',
+ test_mode=True,
+ ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
+ data_prefix=dict(img=''),
+ batch_shapes_cfg=None),
+ class_text_path='data/texts/lvis_v1_class_texts.json',
+ pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
+
+ val_evaluator = dict(type='mmdet.LVISMetric',
+ ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
+ metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+ checkpoint=dict(interval=save_epoch_intervals,
+ rule='greater'))
+ custom_hooks = [
+ dict(type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+ dict(type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - close_mosaic_epochs,
+ switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+ val_interval=10,
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+ _base_.val_interval_stage2)])
+ optim_wrapper = dict(optimizer=dict(
+ _delete_=True,
+ type='AdamW',
+ lr=base_lr,
+ weight_decay=weight_decay,
+ batch_size_per_gpu=train_batch_size_per_gpu),
+ paramwise_cfg=dict(bias_decay_mult=0.0,
+ norm_decay_mult=0.0,
+ custom_keys={
+ 'backbone.text_model':
+ dict(lr_mult=0.01),
+ 'logit_scale':
+ dict(weight_decay=0.0)
+ }),
+ constructor='YOLOWv5OptimizerConstructor')
YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py ADDED
@@ -0,0 +1,171 @@
+ _base_ = ('../../third_party/mmyolo/configs/yolov8/'
+ 'yolov8_m_syncbn_fast_8xb16-500e_coco.py')
+ custom_imports = dict(imports=['yolo_world'],
+ allow_failed_imports=False)
+
+ # hyper-parameters
+ num_classes = 1203
+ num_training_classes = 80
+ max_epochs = 100 # Maximum training epochs
+ close_mosaic_epochs = 2
+ save_epoch_intervals = 2
+ text_channels = 512
+ neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
+ neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
+ base_lr = 2e-3
+ weight_decay = 0.05 / 2
+ train_batch_size_per_gpu = 16
+ text_model_name = 'openai/clip-vit-large-patch14-336'
+ text_model_name = 'openai/clip-vit-base-patch32'
+ # model settings
+ model = dict(
+ type='YOLOWorldDetector',
+ mm_neck=True,
+ num_train_classes=num_training_classes,
+ num_test_classes=num_classes,
+ data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
+ backbone=dict(
+ _delete_=True,
+ type='MultiModalYOLOBackbone',
+ image_model={{_base_.model.backbone}},
+ text_model=dict(
+ type='HuggingCLIPLanguageBackbone',
+ model_name=text_model_name,
+ frozen_modules=['all'])),
+ neck=dict(type='YOLOWorldPAFPN',
+ guide_channels=text_channels,
+ embed_channels=neck_embed_channels,
+ num_heads=neck_num_heads,
+ block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
+ bbox_head=dict(type='YOLOWorldHead',
+ head_module=dict(type='YOLOWorldHeadModule',
+ use_bn_head=True,
+ embed_dims=text_channels,
+ num_classes=num_training_classes)),
+ train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
+
+ # dataset settings
+ text_transform = [
+ dict(type='RandomLoadText',
+ num_neg_samples=(num_classes, num_classes),
+ max_num_samples=num_training_classes,
+ padding_to_max=True,
+ padding_value=''),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
+ 'flip_direction', 'texts'))
+ ]
+ train_pipeline = [
+ *_base_.pre_transform,
+ dict(type='MultiModalMosaic',
+ img_scale=_base_.img_scale,
+ pad_val=114.0,
+ pre_transform=_base_.pre_transform),
+ dict(
+ type='YOLOv5RandomAffine',
+ max_rotate_degree=0.0,
+ max_shear_degree=0.0,
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
+ max_aspect_ratio=_base_.max_aspect_ratio,
+ border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
+ border_val=(114, 114, 114)),
+ *_base_.last_transform[:-1],
+ *text_transform,
+ ]
+ train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]
+ obj365v1_train_dataset = dict(
+ type='MultiModalDataset',
+ dataset=dict(
+ type='YOLOv5Objects365V1Dataset',
+ data_root='data/objects365v1/',
+ ann_file='annotations/objects365_train.json',
+ data_prefix=dict(img='train/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32)),
+ class_text_path='data/texts/obj365v1_class_texts.json',
+ pipeline=train_pipeline)
+
+ mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset',
+ data_root='data/mixed_grounding/',
+ ann_file='annotations/final_mixed_train_no_coco.json',
+ data_prefix=dict(img='gqa/images/'),
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
+ pipeline=train_pipeline)
+
+ flickr_train_dataset = dict(
+ type='YOLOv5MixedGroundingDataset',
+ data_root='data/flickr/',
+ ann_file='annotations/final_flickr_separateGT_train.json',
+ data_prefix=dict(img='full_images/'),
+ filter_cfg=dict(filter_empty_gt=True, min_size=32),
+ pipeline=train_pipeline)
+
+ train_dataloader = dict(batch_size=train_batch_size_per_gpu,
+ collate_fn=dict(type='yolow_collate'),
+ dataset=dict(_delete_=True,
+ type='ConcatDataset',
+ datasets=[
+ obj365v1_train_dataset,
+ flickr_train_dataset, mg_train_dataset
+ ],
+ ignore_keys=['classes', 'palette']))
+
+ test_pipeline = [
+ *_base_.test_pipeline[:-1],
+ dict(type='LoadText'),
+ dict(type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor', 'pad_param', 'texts'))
+ ]
+ coco_val_dataset = dict(
+ _delete_=True,
+ type='MultiModalDataset',
+ dataset=dict(type='YOLOv5LVISV1Dataset',
+ data_root='data/coco/',
+ test_mode=True,
+ ann_file='lvis/lvis_v1_minival_inserted_image_name.json',
+ data_prefix=dict(img=''),
+ batch_shapes_cfg=None),
+ class_text_path='data/texts/lvis_v1_class_texts.json',
+ pipeline=test_pipeline)
+ val_dataloader = dict(dataset=coco_val_dataset)
+ test_dataloader = val_dataloader
+
+ val_evaluator = dict(type='mmdet.LVISMetric',
+ ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json',
+ metric='bbox')
+ test_evaluator = val_evaluator
+
+ # training settings
+ default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs),
+ checkpoint=dict(interval=save_epoch_intervals,
+ rule='greater'))
+ custom_hooks = [
+ dict(type='EMAHook',
+ ema_type='ExpMomentumEMA',
+ momentum=0.0001,
+ update_buffers=True,
+ strict_load=False,
+ priority=49),
+ dict(type='mmdet.PipelineSwitchHook',
+ switch_epoch=max_epochs - close_mosaic_epochs,
+ switch_pipeline=train_pipeline_stage2)
+ ]
+ train_cfg = dict(max_epochs=max_epochs,
+ val_interval=10,
+ dynamic_intervals=[((max_epochs - close_mosaic_epochs),
+ _base_.val_interval_stage2)])
+ optim_wrapper = dict(optimizer=dict(
+ _delete_=True,
+ type='AdamW',
+ lr=base_lr,
+ weight_decay=weight_decay,
+ batch_size_per_gpu=train_batch_size_per_gpu),
+ paramwise_cfg=dict(bias_decay_mult=0.0,
+ norm_decay_mult=0.0,
+ custom_keys={
+ 'backbone.text_model':
+ dict(lr_mult=0.01),
+ 'logit_scale':
+ dict(weight_decay=0.0)
+ }),
+ constructor='YOLOWv5OptimizerConstructor')
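Finally, the LVIS evaluators defined in these pretraining configs can be exercised without any training step. Below is a minimal zero-shot evaluation sketch, assuming the standard MMEngine APIs, that the YOLO-World package and the data layout referenced by these configs are in place, and a hypothetical local checkpoint path; work_dir is an illustrative choice required by the Runner.

from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'YOLO-World/configs/pretrain/'
    'yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py')
cfg.work_dir = './work_dirs/lvis_minival_eval'     # illustrative output directory
cfg.load_from = '/data/ckpts/yolo_world_v2_m.pth'  # hypothetical checkpoint path

runner = Runner.from_cfg(cfg)
runner.test()  # runs test_dataloader with the mmdet.LVISMetric defined above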