WatsonTang98
commited on
Upload 21 files
Browse files- handcrafted_solution.py +245 -0
- pc_util/.DS_Store +0 -0
- pc_util/setup.py +23 -0
- pc_util/src/ball_query.cpp +84 -0
- pc_util/src/ball_query_gpu.cu +270 -0
- pc_util/src/ball_query_gpu.h +38 -0
- pc_util/src/cluster.cpp +50 -0
- pc_util/src/cluster_gpu.cu +192 -0
- pc_util/src/cluster_gpu.h +34 -0
- pc_util/src/cuda_utils.h +15 -0
- pc_util/src/group_points.cpp +98 -0
- pc_util/src/group_points_gpu.cu +199 -0
- pc_util/src/group_points_gpu.h +36 -0
- pc_util/src/interpolate.cpp +148 -0
- pc_util/src/interpolate_gpu.cu +343 -0
- pc_util/src/interpolate_gpu.h +61 -0
- pc_util/src/pointnet2_api.cpp +41 -0
- pc_util/src/sampling.cpp +46 -0
- pc_util/src/sampling_gpu.cu +259 -0
- pc_util/src/sampling_gpu.h +29 -0
- script.py +297 -0
handcrafted_solution.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: This file contains the handcrafted solution for the task of wireframe reconstruction
|
2 |
+
|
3 |
+
import io
|
4 |
+
from PIL import Image as PImage
|
5 |
+
import numpy as np
|
6 |
+
from collections import defaultdict
|
7 |
+
import cv2
|
8 |
+
from typing import Tuple, List
|
9 |
+
from scipy.spatial.distance import cdist
|
10 |
+
|
11 |
+
from hoho.read_write_colmap import read_cameras_binary, read_images_binary, read_points3D_binary
|
12 |
+
from hoho.color_mappings import gestalt_color_mapping, ade20k_color_mapping
|
13 |
+
|
14 |
+
|
15 |
+
def empty_solution():
|
16 |
+
'''Return a minimal valid solution, i.e. 2 vertices and 1 edge.'''
|
17 |
+
return np.zeros((2,3)), [(0, 1)]
|
18 |
+
|
19 |
+
|
20 |
+
def convert_entry_to_human_readable(entry):
|
21 |
+
out = {}
|
22 |
+
already_good = ['__key__', 'wf_vertices', 'wf_edges', 'edge_semantics', 'mesh_vertices', 'mesh_faces', 'face_semantics', 'K', 'R', 't']
|
23 |
+
for k, v in entry.items():
|
24 |
+
if k in already_good:
|
25 |
+
out[k] = v
|
26 |
+
continue
|
27 |
+
if k == 'points3d':
|
28 |
+
out[k] = read_points3D_binary(fid=io.BytesIO(v))
|
29 |
+
if k == 'cameras':
|
30 |
+
out[k] = read_cameras_binary(fid=io.BytesIO(v))
|
31 |
+
if k == 'images':
|
32 |
+
out[k] = read_images_binary(fid=io.BytesIO(v))
|
33 |
+
if k in ['ade20k', 'gestalt']:
|
34 |
+
out[k] = [PImage.open(io.BytesIO(x)).convert('RGB') for x in v]
|
35 |
+
if k == 'depthcm':
|
36 |
+
out[k] = [PImage.open(io.BytesIO(x)) for x in entry['depthcm']]
|
37 |
+
return out
|
38 |
+
|
39 |
+
|
40 |
+
def get_vertices_and_edges_from_segmentation(gest_seg_np, edge_th = 50.0):
|
41 |
+
'''Get the vertices and edges from the gestalt segmentation mask of the house'''
|
42 |
+
vertices = []
|
43 |
+
connections = []
|
44 |
+
# Apex
|
45 |
+
apex_color = np.array(gestalt_color_mapping['apex'])
|
46 |
+
apex_mask = cv2.inRange(gest_seg_np, apex_color-0.5, apex_color+0.5)
|
47 |
+
if apex_mask.sum() > 0:
|
48 |
+
output = cv2.connectedComponentsWithStats(apex_mask, 8, cv2.CV_32S)
|
49 |
+
(numLabels, labels, stats, centroids) = output
|
50 |
+
stats, centroids = stats[1:], centroids[1:]
|
51 |
+
|
52 |
+
for i in range(numLabels-1):
|
53 |
+
vert = {"xy": centroids[i], "type": "apex"}
|
54 |
+
vertices.append(vert)
|
55 |
+
|
56 |
+
eave_end_color = np.array(gestalt_color_mapping['eave_end_point'])
|
57 |
+
eave_end_mask = cv2.inRange(gest_seg_np, eave_end_color-0.5, eave_end_color+0.5)
|
58 |
+
if eave_end_mask.sum() > 0:
|
59 |
+
output = cv2.connectedComponentsWithStats(eave_end_mask, 8, cv2.CV_32S)
|
60 |
+
(numLabels, labels, stats, centroids) = output
|
61 |
+
stats, centroids = stats[1:], centroids[1:]
|
62 |
+
|
63 |
+
for i in range(numLabels-1):
|
64 |
+
vert = {"xy": centroids[i], "type": "eave_end_point"}
|
65 |
+
vertices.append(vert)
|
66 |
+
# Connectivity
|
67 |
+
apex_pts = []
|
68 |
+
apex_pts_idxs = []
|
69 |
+
for j, v in enumerate(vertices):
|
70 |
+
apex_pts.append(v['xy'])
|
71 |
+
apex_pts_idxs.append(j)
|
72 |
+
apex_pts = np.array(apex_pts)
|
73 |
+
|
74 |
+
# Ridge connects two apex points
|
75 |
+
for edge_class in ['eave', 'ridge', 'rake', 'valley']:
|
76 |
+
edge_color = np.array(gestalt_color_mapping[edge_class])
|
77 |
+
mask = cv2.morphologyEx(cv2.inRange(gest_seg_np,
|
78 |
+
edge_color-0.5,
|
79 |
+
edge_color+0.5),
|
80 |
+
cv2.MORPH_DILATE, np.ones((11, 11)))
|
81 |
+
line_img = np.copy(gest_seg_np) * 0
|
82 |
+
if mask.sum() > 0:
|
83 |
+
output = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)
|
84 |
+
(numLabels, labels, stats, centroids) = output
|
85 |
+
stats, centroids = stats[1:], centroids[1:]
|
86 |
+
edges = []
|
87 |
+
for i in range(1, numLabels):
|
88 |
+
y,x = np.where(labels == i)
|
89 |
+
xleft_idx = np.argmin(x)
|
90 |
+
x_left = x[xleft_idx]
|
91 |
+
y_left = y[xleft_idx]
|
92 |
+
xright_idx = np.argmax(x)
|
93 |
+
x_right = x[xright_idx]
|
94 |
+
y_right = y[xright_idx]
|
95 |
+
edges.append((x_left, y_left, x_right, y_right))
|
96 |
+
cv2.line(line_img, (x_left, y_left), (x_right, y_right), (255, 255, 255), 2)
|
97 |
+
edges = np.array(edges)
|
98 |
+
if (len(apex_pts) < 2) or len(edges) <1:
|
99 |
+
continue
|
100 |
+
pts_to_edges_dist = np.minimum(cdist(apex_pts, edges[:,:2]), cdist(apex_pts, edges[:,2:]))
|
101 |
+
connectivity_mask = pts_to_edges_dist <= edge_th
|
102 |
+
edge_connects = connectivity_mask.sum(axis=0)
|
103 |
+
for edge_idx, edgesum in enumerate(edge_connects):
|
104 |
+
if edgesum>=2:
|
105 |
+
connected_verts = np.where(connectivity_mask[:,edge_idx])[0]
|
106 |
+
for a_i, a in enumerate(connected_verts):
|
107 |
+
for b in connected_verts[a_i+1:]:
|
108 |
+
connections.append((a, b))
|
109 |
+
return vertices, connections
|
110 |
+
|
111 |
+
def get_uv_depth(vertices, depth):
|
112 |
+
'''Get the depth of the vertices from the depth image'''
|
113 |
+
uv = []
|
114 |
+
for v in vertices:
|
115 |
+
uv.append(v['xy'])
|
116 |
+
uv = np.array(uv)
|
117 |
+
uv_int = uv.astype(np.int32)
|
118 |
+
H, W = depth.shape[:2]
|
119 |
+
uv_int[:, 0] = np.clip( uv_int[:, 0], 0, W-1)
|
120 |
+
uv_int[:, 1] = np.clip( uv_int[:, 1], 0, H-1)
|
121 |
+
vertex_depth = depth[(uv_int[:, 1] , uv_int[:, 0])]
|
122 |
+
return uv, vertex_depth
|
123 |
+
|
124 |
+
|
125 |
+
def merge_vertices_3d(vert_edge_per_image, th=0.1):
|
126 |
+
'''Merge vertices that are close to each other in 3D space and are of same types'''
|
127 |
+
all_3d_vertices = []
|
128 |
+
connections_3d = []
|
129 |
+
all_indexes = []
|
130 |
+
cur_start = 0
|
131 |
+
types = []
|
132 |
+
for cimg_idx, (vertices, connections, vertices_3d) in vert_edge_per_image.items():
|
133 |
+
types += [int(v['type']=='apex') for v in vertices]
|
134 |
+
all_3d_vertices.append(vertices_3d)
|
135 |
+
connections_3d+=[(x+cur_start,y+cur_start) for (x,y) in connections]
|
136 |
+
cur_start+=len(vertices_3d)
|
137 |
+
all_3d_vertices = np.concatenate(all_3d_vertices, axis=0)
|
138 |
+
#print (connections_3d)
|
139 |
+
distmat = cdist(all_3d_vertices, all_3d_vertices)
|
140 |
+
types = np.array(types).reshape(-1,1)
|
141 |
+
same_types = cdist(types, types)
|
142 |
+
mask_to_merge = (distmat <= th) & (same_types==0)
|
143 |
+
new_vertices = []
|
144 |
+
new_connections = []
|
145 |
+
to_merge = sorted(list(set([tuple(a.nonzero()[0].tolist()) for a in mask_to_merge])))
|
146 |
+
to_merge_final = defaultdict(list)
|
147 |
+
for i in range(len(all_3d_vertices)):
|
148 |
+
for j in to_merge:
|
149 |
+
if i in j:
|
150 |
+
to_merge_final[i]+=j
|
151 |
+
for k, v in to_merge_final.items():
|
152 |
+
to_merge_final[k] = list(set(v))
|
153 |
+
already_there = set()
|
154 |
+
merged = []
|
155 |
+
for k, v in to_merge_final.items():
|
156 |
+
if k in already_there:
|
157 |
+
continue
|
158 |
+
merged.append(v)
|
159 |
+
for vv in v:
|
160 |
+
already_there.add(vv)
|
161 |
+
old_idx_to_new = {}
|
162 |
+
count=0
|
163 |
+
for idxs in merged:
|
164 |
+
new_vertices.append(all_3d_vertices[idxs].mean(axis=0))
|
165 |
+
for idx in idxs:
|
166 |
+
old_idx_to_new[idx] = count
|
167 |
+
count +=1
|
168 |
+
#print (connections_3d)
|
169 |
+
new_vertices=np.array(new_vertices)
|
170 |
+
#print (connections_3d)
|
171 |
+
for conn in connections_3d:
|
172 |
+
new_con = sorted((old_idx_to_new[conn[0]], old_idx_to_new[conn[1]]))
|
173 |
+
if new_con[0] == new_con[1]:
|
174 |
+
continue
|
175 |
+
if new_con not in new_connections:
|
176 |
+
new_connections.append(new_con)
|
177 |
+
#print (f'{len(new_vertices)} left after merging {len(all_3d_vertices)} with {th=}')
|
178 |
+
return new_vertices, new_connections
|
179 |
+
|
180 |
+
def prune_not_connected(all_3d_vertices, connections_3d):
|
181 |
+
'''Prune vertices that are not connected to any other vertex'''
|
182 |
+
connected = defaultdict(list)
|
183 |
+
for c in connections_3d:
|
184 |
+
connected[c[0]].append(c)
|
185 |
+
connected[c[1]].append(c)
|
186 |
+
new_indexes = {}
|
187 |
+
new_verts = []
|
188 |
+
connected_out = []
|
189 |
+
for k,v in connected.items():
|
190 |
+
vert = all_3d_vertices[k]
|
191 |
+
if tuple(vert) not in new_verts:
|
192 |
+
new_verts.append(tuple(vert))
|
193 |
+
new_indexes[k]=len(new_verts) -1
|
194 |
+
for k,v in connected.items():
|
195 |
+
for vv in v:
|
196 |
+
connected_out.append((new_indexes[vv[0]],new_indexes[vv[1]]))
|
197 |
+
connected_out=list(set(connected_out))
|
198 |
+
|
199 |
+
return np.array(new_verts), connected_out
|
200 |
+
|
201 |
+
|
202 |
+
def predict(entry, visualize=False) -> Tuple[np.ndarray, List[int]]:
|
203 |
+
good_entry = convert_entry_to_human_readable(entry)
|
204 |
+
vert_edge_per_image = {}
|
205 |
+
for i, (gest, depth, K, R, t) in enumerate(zip(good_entry['gestalt'],
|
206 |
+
good_entry['depthcm'],
|
207 |
+
good_entry['K'],
|
208 |
+
good_entry['R'],
|
209 |
+
good_entry['t']
|
210 |
+
)):
|
211 |
+
gest_seg = gest.resize(depth.size)
|
212 |
+
gest_seg_np = np.array(gest_seg).astype(np.uint8)
|
213 |
+
# Metric3D
|
214 |
+
depth_np = np.array(depth) / 2.5 # 2.5 is the scale estimation coefficient
|
215 |
+
vertices, connections = get_vertices_and_edges_from_segmentation(gest_seg_np, edge_th = 5.)
|
216 |
+
if (len(vertices) < 2) or (len(connections) < 1):
|
217 |
+
print (f'Not enough vertices or connections in image {i}')
|
218 |
+
vert_edge_per_image[i] = np.empty((0, 2)), [], np.empty((0, 3))
|
219 |
+
continue
|
220 |
+
uv, depth_vert = get_uv_depth(vertices, depth_np)
|
221 |
+
# Normalize the uv to the camera intrinsics
|
222 |
+
xy_local = np.ones((len(uv), 3))
|
223 |
+
xy_local[:, 0] = (uv[:, 0] - K[0,2]) / K[0,0]
|
224 |
+
xy_local[:, 1] = (uv[:, 1] - K[1,2]) / K[1,1]
|
225 |
+
# Get the 3D vertices
|
226 |
+
vertices_3d_local = depth_vert[...,None] * (xy_local/np.linalg.norm(xy_local, axis=1)[...,None])
|
227 |
+
world_to_cam = np.eye(4)
|
228 |
+
world_to_cam[:3, :3] = R
|
229 |
+
world_to_cam[:3, 3] = t.reshape(-1)
|
230 |
+
cam_to_world = np.linalg.inv(world_to_cam)
|
231 |
+
vertices_3d = cv2.transform(cv2.convertPointsToHomogeneous(vertices_3d_local), cam_to_world)
|
232 |
+
vertices_3d = cv2.convertPointsFromHomogeneous(vertices_3d).reshape(-1, 3)
|
233 |
+
vert_edge_per_image[i] = vertices, connections, vertices_3d
|
234 |
+
all_3d_vertices, connections_3d = merge_vertices_3d(vert_edge_per_image, 3.0)
|
235 |
+
all_3d_vertices_clean, connections_3d_clean = prune_not_connected(all_3d_vertices, connections_3d)
|
236 |
+
if (len(all_3d_vertices_clean) < 2) or len(connections_3d_clean) < 1:
|
237 |
+
print (f'Not enough vertices or connections in the 3D vertices')
|
238 |
+
return (good_entry['__key__'], *empty_solution())
|
239 |
+
if visualize:
|
240 |
+
from hoho.viz3d import plot_estimate_and_gt
|
241 |
+
plot_estimate_and_gt( all_3d_vertices_clean,
|
242 |
+
connections_3d_clean,
|
243 |
+
good_entry['wf_vertices'],
|
244 |
+
good_entry['wf_edges'])
|
245 |
+
return good_entry['__key__'], all_3d_vertices_clean, connections_3d_clean
|
pc_util/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
pc_util/setup.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup
|
2 |
+
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
|
3 |
+
|
4 |
+
setup(
|
5 |
+
name='pc_util',
|
6 |
+
version='1.0',
|
7 |
+
ext_modules=[
|
8 |
+
CUDAExtension('pc_util', [
|
9 |
+
'src/pointnet2_api.cpp',
|
10 |
+
'src/ball_query.cpp',
|
11 |
+
'src/ball_query_gpu.cu',
|
12 |
+
'src/group_points.cpp',
|
13 |
+
'src/group_points_gpu.cu',
|
14 |
+
'src/interpolate.cpp',
|
15 |
+
'src/interpolate_gpu.cu',
|
16 |
+
'src/sampling.cpp',
|
17 |
+
'src/sampling_gpu.cu',
|
18 |
+
'src/cluster.cpp',
|
19 |
+
'src/cluster_gpu.cu',
|
20 |
+
], extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
|
21 |
+
],
|
22 |
+
cmdclass={'build_ext': BuildExtension}
|
23 |
+
)
|
pc_util/src/ball_query.cpp
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/serialize/tensor.h>
|
2 |
+
#include <vector>
|
3 |
+
// #include <THC/THC.h>
|
4 |
+
#include <cuda.h>
|
5 |
+
#include <cuda_runtime_api.h>
|
6 |
+
#include "ball_query_gpu.h"
|
7 |
+
|
8 |
+
// extern THCState *state;
|
9 |
+
|
10 |
+
#include <ATen/cuda/CUDAContext.h>
|
11 |
+
#include <ATen/cuda/CUDAEvent.h>
|
12 |
+
// cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
13 |
+
|
14 |
+
#define CHECK_CUDA(x) do { \
|
15 |
+
if (!x.type().is_cuda()) { \
|
16 |
+
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
17 |
+
exit(-1); \
|
18 |
+
} \
|
19 |
+
} while (0)
|
20 |
+
#define CHECK_CONTIGUOUS(x) do { \
|
21 |
+
if (!x.is_contiguous()) { \
|
22 |
+
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
23 |
+
exit(-1); \
|
24 |
+
} \
|
25 |
+
} while (0)
|
26 |
+
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
|
27 |
+
|
28 |
+
int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
|
29 |
+
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
|
30 |
+
CHECK_INPUT(new_xyz_tensor);
|
31 |
+
CHECK_INPUT(xyz_tensor);
|
32 |
+
const float *new_xyz = new_xyz_tensor.data<float>();
|
33 |
+
const float *xyz = xyz_tensor.data<float>();
|
34 |
+
int *idx = idx_tensor.data<int>();
|
35 |
+
|
36 |
+
ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx);
|
37 |
+
return 1;
|
38 |
+
}
|
39 |
+
|
40 |
+
|
41 |
+
int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
|
42 |
+
at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor) {
|
43 |
+
CHECK_INPUT(point_tensor);
|
44 |
+
CHECK_INPUT(key_point_tensor);
|
45 |
+
const float *point = point_tensor.data<float>();
|
46 |
+
const float *key_point = key_point_tensor.data<float>();
|
47 |
+
int *idx = idx_tensor.data<int>();
|
48 |
+
|
49 |
+
ball_center_query_kernel_launcher_fast(b, n, m, radius, point, key_point, idx);
|
50 |
+
return 1;
|
51 |
+
}
|
52 |
+
|
53 |
+
|
54 |
+
int knn_query_wrapper_fast(int b, int n, int m, int nsample,
|
55 |
+
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
|
56 |
+
CHECK_INPUT(new_xyz_tensor);
|
57 |
+
CHECK_INPUT(xyz_tensor);
|
58 |
+
const float *new_xyz = new_xyz_tensor.data<float>();
|
59 |
+
const float *xyz = xyz_tensor.data<float>();
|
60 |
+
float *dist2 = dist2_tensor.data<float>();
|
61 |
+
int *idx = idx_tensor.data<int>();
|
62 |
+
|
63 |
+
knn_query_kernel_launcher_fast(b, n, m, nsample, new_xyz, xyz, dist2, idx);
|
64 |
+
return 1;
|
65 |
+
}
|
66 |
+
|
67 |
+
|
68 |
+
int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
|
69 |
+
at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
|
70 |
+
at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor) {
|
71 |
+
CHECK_INPUT(new_xyz_tensor);
|
72 |
+
CHECK_INPUT(xyz_tensor);
|
73 |
+
CHECK_INPUT(new_xyz_batch_cnt_tensor);
|
74 |
+
CHECK_INPUT(xyz_batch_cnt_tensor);
|
75 |
+
|
76 |
+
const float *new_xyz = new_xyz_tensor.data<float>();
|
77 |
+
const float *xyz = xyz_tensor.data<float>();
|
78 |
+
const int *new_xyz_batch_cnt = new_xyz_batch_cnt_tensor.data<int>();
|
79 |
+
const int *xyz_batch_cnt = xyz_batch_cnt_tensor.data<int>();
|
80 |
+
int *idx = idx_tensor.data<int>();
|
81 |
+
|
82 |
+
ball_query_kernel_launcher_stack(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
|
83 |
+
return 1;
|
84 |
+
}
|
pc_util/src/ball_query_gpu.cu
ADDED
@@ -0,0 +1,270 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <math.h>
|
2 |
+
#include <stdio.h>
|
3 |
+
#include <stdlib.h>
|
4 |
+
|
5 |
+
|
6 |
+
#include "ball_query_gpu.h"
|
7 |
+
#include "cuda_utils.h"
|
8 |
+
|
9 |
+
|
10 |
+
__global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
|
11 |
+
const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
|
12 |
+
// new_xyz: (B, M, 3)
|
13 |
+
// xyz: (B, N, 3)
|
14 |
+
// output:
|
15 |
+
// idx: (B, M, nsample)
|
16 |
+
int bs_idx = blockIdx.y;
|
17 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
18 |
+
if (bs_idx >= b || pt_idx >= m) return;
|
19 |
+
|
20 |
+
new_xyz += bs_idx * m * 3 + pt_idx * 3;
|
21 |
+
xyz += bs_idx * n * 3;
|
22 |
+
idx += bs_idx * m * nsample + pt_idx * nsample;
|
23 |
+
|
24 |
+
float radius2 = radius * radius;
|
25 |
+
float new_x = new_xyz[0];
|
26 |
+
float new_y = new_xyz[1];
|
27 |
+
float new_z = new_xyz[2];
|
28 |
+
|
29 |
+
int cnt = 0;
|
30 |
+
for (int k = 0; k < n; ++k) {
|
31 |
+
float x = xyz[k * 3 + 0];
|
32 |
+
float y = xyz[k * 3 + 1];
|
33 |
+
float z = xyz[k * 3 + 2];
|
34 |
+
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
|
35 |
+
if (d2 < radius2){
|
36 |
+
if (cnt == 0){
|
37 |
+
for (int l = 0; l < nsample; ++l) {
|
38 |
+
idx[l] = k;
|
39 |
+
}
|
40 |
+
}
|
41 |
+
idx[cnt] = k;
|
42 |
+
++cnt;
|
43 |
+
if (cnt >= nsample) break;
|
44 |
+
}
|
45 |
+
}
|
46 |
+
}
|
47 |
+
|
48 |
+
|
49 |
+
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \
|
50 |
+
const float *new_xyz, const float *xyz, int *idx) {
|
51 |
+
// new_xyz: (B, M, 3)
|
52 |
+
// xyz: (B, N, 3)
|
53 |
+
// output:
|
54 |
+
// idx: (B, M, nsample)
|
55 |
+
|
56 |
+
cudaError_t err;
|
57 |
+
|
58 |
+
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
|
59 |
+
dim3 threads(THREADS_PER_BLOCK);
|
60 |
+
|
61 |
+
ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
|
62 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
63 |
+
err = cudaGetLastError();
|
64 |
+
if (cudaSuccess != err) {
|
65 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
66 |
+
exit(-1);
|
67 |
+
}
|
68 |
+
}
|
69 |
+
|
70 |
+
|
71 |
+
__global__ void ball_center_query_kernel_fast(int b, int n, int m, float radius, \
|
72 |
+
const float *__restrict__ point, const float *__restrict__ key_point, int *__restrict__ idx) {
|
73 |
+
// key_point: (B, M, 3)
|
74 |
+
// point: (B, N, 3)
|
75 |
+
// output:
|
76 |
+
// idx: (B, N)
|
77 |
+
int bs_idx = blockIdx.y;
|
78 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
79 |
+
if (bs_idx >= b || pt_idx >= n) return;
|
80 |
+
|
81 |
+
point += bs_idx * n * 3 + pt_idx * 3;
|
82 |
+
key_point += bs_idx * m * 3;
|
83 |
+
idx += bs_idx * n + pt_idx;
|
84 |
+
|
85 |
+
float radius2 = radius * radius;
|
86 |
+
float point_x = point[0];
|
87 |
+
float point_y = point[1];
|
88 |
+
float point_z = point[2];
|
89 |
+
|
90 |
+
float bestd = 1e8;
|
91 |
+
for (int k = 0; k < m; ++k) {
|
92 |
+
float x = key_point[k * 3 + 0];
|
93 |
+
float y = key_point[k * 3 + 1];
|
94 |
+
float z = key_point[k * 3 + 2];
|
95 |
+
if (((x + 1) * (x + 1) + (y + 1) * (y + 1) + (z + 1) * (z + 1)) < 1e-4) break;
|
96 |
+
float d2 = (point_x - x) * (point_x - x) + (point_y - y) * (point_y - y) + (point_z - z) * (point_z - z);
|
97 |
+
if (d2 < radius2 && d2 < bestd){
|
98 |
+
idx[0] = k;
|
99 |
+
bestd = d2;
|
100 |
+
}
|
101 |
+
}
|
102 |
+
}
|
103 |
+
|
104 |
+
|
105 |
+
void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius, \
|
106 |
+
const float *point, const float *key_point, int *idx) {
|
107 |
+
// point: (B, n, 3)
|
108 |
+
// key_point: (B, m, 3)
|
109 |
+
// output:
|
110 |
+
// idx: (B, n)
|
111 |
+
|
112 |
+
cudaError_t err;
|
113 |
+
|
114 |
+
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
|
115 |
+
dim3 threads(THREADS_PER_BLOCK);
|
116 |
+
|
117 |
+
ball_center_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, point, key_point, idx);
|
118 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
119 |
+
err = cudaGetLastError();
|
120 |
+
if (cudaSuccess != err) {
|
121 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
122 |
+
exit(-1);
|
123 |
+
}
|
124 |
+
}
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
__global__ void knn_query_kernel_fast(int b, int n, int m, int nsample, const float *__restrict__ new_xyz,
|
131 |
+
const float *__restrict__ xyz, float *__restrict__ dist2, int *__restrict__ idx) {
|
132 |
+
|
133 |
+
// new_xyz: (B, M, 3)
|
134 |
+
// xyz: (B, N, 3)
|
135 |
+
// output:
|
136 |
+
// dist2: (B, M, nsample)
|
137 |
+
// idx: (B, M, nsample)
|
138 |
+
|
139 |
+
int bs_idx = blockIdx.y;
|
140 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
141 |
+
if (bs_idx >= b || pt_idx >= m) return;
|
142 |
+
|
143 |
+
new_xyz += bs_idx * m * 3 + pt_idx * 3;
|
144 |
+
xyz += bs_idx * n * 3;
|
145 |
+
dist2 += bs_idx * m * nsample + pt_idx * nsample;
|
146 |
+
idx += bs_idx * m * nsample + pt_idx * nsample;
|
147 |
+
|
148 |
+
float nx = new_xyz[0];
|
149 |
+
float ny = new_xyz[1];
|
150 |
+
float nz = new_xyz[2];
|
151 |
+
|
152 |
+
for (int i = 0; i < n; ++i) {
|
153 |
+
float x = xyz[i * 3 + 0];
|
154 |
+
float y = xyz[i * 3 + 1];
|
155 |
+
float z = xyz[i * 3 + 2];
|
156 |
+
float d2 = (nx - x) * (nx - x) + (ny - y) * (ny - y) + (nz - z) * (nz - z);
|
157 |
+
if (d2 < dist2[nsample - 1]) {
|
158 |
+
dist2[nsample - 1] = d2;
|
159 |
+
idx[nsample - 1] = i;
|
160 |
+
for (int j = nsample - 2; j >= 0; j--) {
|
161 |
+
if (d2 < dist2[j]){
|
162 |
+
dist2[j + 1] = dist2[j];
|
163 |
+
dist2[j] = d2;
|
164 |
+
idx[j + 1] = idx[j];
|
165 |
+
idx[j] = i;
|
166 |
+
}
|
167 |
+
}
|
168 |
+
}
|
169 |
+
}
|
170 |
+
}
|
171 |
+
|
172 |
+
|
173 |
+
void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample, \
|
174 |
+
const float *new_xyz, const float *xyz, float *dist2, int *idx) {
|
175 |
+
cudaError_t err;
|
176 |
+
|
177 |
+
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
|
178 |
+
dim3 threads(THREADS_PER_BLOCK);
|
179 |
+
|
180 |
+
knn_query_kernel_fast<<<blocks, threads>>>(b, n, m, nsample, new_xyz, xyz, dist2, idx);
|
181 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
182 |
+
err = cudaGetLastError();
|
183 |
+
if (cudaSuccess != err) {
|
184 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
185 |
+
exit(-1);
|
186 |
+
}
|
187 |
+
}
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
|
195 |
+
|
196 |
+
__global__ void ball_query_kernel_stack(int B, int M, float radius, int nsample, \
|
197 |
+
const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) {
|
198 |
+
// :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
|
199 |
+
// :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
|
200 |
+
// :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
|
201 |
+
// :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
|
202 |
+
// output:
|
203 |
+
// idx: (M, nsample)
|
204 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
205 |
+
if (pt_idx >= M) return;
|
206 |
+
|
207 |
+
int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
|
208 |
+
for (int k = 1; k < B; k++){
|
209 |
+
if (pt_idx < pt_cnt) break;
|
210 |
+
pt_cnt += new_xyz_batch_cnt[k];
|
211 |
+
bs_idx = k;
|
212 |
+
}
|
213 |
+
|
214 |
+
int xyz_batch_start_idx = 0;
|
215 |
+
for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];
|
216 |
+
// for (int k = 0; k < bs_idx; k++) new_xyz_batch_start_idx += new_xyz_batch_cnt[k];
|
217 |
+
|
218 |
+
new_xyz += pt_idx * 3;
|
219 |
+
xyz += xyz_batch_start_idx * 3;
|
220 |
+
idx += pt_idx * nsample;
|
221 |
+
|
222 |
+
float radius2 = radius * radius;
|
223 |
+
float new_x = new_xyz[0];
|
224 |
+
float new_y = new_xyz[1];
|
225 |
+
float new_z = new_xyz[2];
|
226 |
+
int n = xyz_batch_cnt[bs_idx];
|
227 |
+
|
228 |
+
int cnt = 0;
|
229 |
+
for (int k = 0; k < n; ++k) {
|
230 |
+
float x = xyz[k * 3 + 0];
|
231 |
+
float y = xyz[k * 3 + 1];
|
232 |
+
float z = xyz[k * 3 + 2];
|
233 |
+
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
|
234 |
+
if (d2 < radius2){
|
235 |
+
if (cnt == 0){
|
236 |
+
for (int l = 0; l < nsample; ++l) {
|
237 |
+
idx[l] = k;
|
238 |
+
}
|
239 |
+
}
|
240 |
+
idx[cnt] = k;
|
241 |
+
++cnt;
|
242 |
+
if (cnt >= nsample) break;
|
243 |
+
}
|
244 |
+
}
|
245 |
+
if (cnt == 0) idx[0] = -1;
|
246 |
+
}
|
247 |
+
|
248 |
+
|
249 |
+
void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
|
250 |
+
const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx){
|
251 |
+
// :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
|
252 |
+
// :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
|
253 |
+
// :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
|
254 |
+
// :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
|
255 |
+
// output:
|
256 |
+
// idx: (M, nsample)
|
257 |
+
|
258 |
+
cudaError_t err;
|
259 |
+
|
260 |
+
dim3 blocks(DIVUP(M, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
|
261 |
+
dim3 threads(THREADS_PER_BLOCK);
|
262 |
+
|
263 |
+
ball_query_kernel_stack<<<blocks, threads>>>(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
|
264 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
265 |
+
err = cudaGetLastError();
|
266 |
+
if (cudaSuccess != err) {
|
267 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
268 |
+
exit(-1);
|
269 |
+
}
|
270 |
+
}
|
pc_util/src/ball_query_gpu.h
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef _BALL_QUERY_GPU_H
|
2 |
+
#define _BALL_QUERY_GPU_H
|
3 |
+
|
4 |
+
#include <torch/serialize/tensor.h>
|
5 |
+
#include <vector>
|
6 |
+
#include <cuda.h>
|
7 |
+
#include <cuda_runtime_api.h>
|
8 |
+
|
9 |
+
int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
|
10 |
+
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
|
11 |
+
|
12 |
+
void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
|
13 |
+
const float *new_xyz, const float *xyz, int *idx);
|
14 |
+
|
15 |
+
int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
|
16 |
+
at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor);
|
17 |
+
|
18 |
+
void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius,
|
19 |
+
const float *point, const float *key_point, int *idx);
|
20 |
+
|
21 |
+
int knn_query_wrapper_fast(int b, int n, int m, int nsample,
|
22 |
+
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
|
23 |
+
|
24 |
+
void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample,
|
25 |
+
const float *new_xyz, const float *xyz, float *dist2, int *idx);
|
26 |
+
|
27 |
+
|
28 |
+
int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
|
29 |
+
at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
|
30 |
+
at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor);
|
31 |
+
|
32 |
+
|
33 |
+
void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
|
34 |
+
const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx);
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
#endif
|
pc_util/src/cluster.cpp
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/serialize/tensor.h>
|
2 |
+
#include <vector>
|
3 |
+
// #include <THC/THC.h>
|
4 |
+
#include <cuda.h>
|
5 |
+
#include <cuda_runtime_api.h>
|
6 |
+
#include "cluster_gpu.h"
|
7 |
+
|
8 |
+
// extern THCState *state;
|
9 |
+
|
10 |
+
#include <ATen/cuda/CUDAContext.h>
|
11 |
+
#include <ATen/cuda/CUDAEvent.h>
|
12 |
+
// cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
13 |
+
|
14 |
+
#define CHECK_CUDA(x) do { \
|
15 |
+
if (!x.type().is_cuda()) { \
|
16 |
+
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
17 |
+
exit(-1); \
|
18 |
+
} \
|
19 |
+
} while (0)
|
20 |
+
#define CHECK_CONTIGUOUS(x) do { \
|
21 |
+
if (!x.is_contiguous()) { \
|
22 |
+
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
23 |
+
exit(-1); \
|
24 |
+
} \
|
25 |
+
} while (0)
|
26 |
+
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
|
27 |
+
|
28 |
+
int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
|
29 |
+
CHECK_INPUT(xyz_tensor);
|
30 |
+
const float *xyz = xyz_tensor.data<float>();
|
31 |
+
int *idx = idx_tensor.data<int>();
|
32 |
+
|
33 |
+
dbscan_kernel_launcher_fast(b, n, eps, min_pts, xyz, idx);
|
34 |
+
return 1;
|
35 |
+
}
|
36 |
+
|
37 |
+
|
38 |
+
int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
|
39 |
+
at::Tensor new_xyz_tensor, at::Tensor num_tensor) {
|
40 |
+
CHECK_INPUT(xyz_tensor);
|
41 |
+
CHECK_INPUT(idx_tensor);
|
42 |
+
const float *xyz = xyz_tensor.data<float>();
|
43 |
+
const int *idx = idx_tensor.data<int>();
|
44 |
+
float *new_xyz = new_xyz_tensor.data<float>();
|
45 |
+
int *num = num_tensor.data<int>();
|
46 |
+
|
47 |
+
cluster_pts_kernel_launcher_fast(b, n, m, xyz, idx, new_xyz, num);
|
48 |
+
return 1;
|
49 |
+
}
|
50 |
+
|
pc_util/src/cluster_gpu.cu
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <math.h>
|
2 |
+
#include <stdio.h>
|
3 |
+
#include <stdlib.h>
|
4 |
+
|
5 |
+
|
6 |
+
#include "cluster_gpu.h"
|
7 |
+
#include "cuda_utils.h"
|
8 |
+
|
9 |
+
|
10 |
+
__device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) {
|
11 |
+
float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2);
|
12 |
+
return sqrt(dis);
|
13 |
+
}
|
14 |
+
/*
|
15 |
+
__device__ void dfs (int i, int c, int n, int min_pts, const int* pts_cnt, const int* pts_adj, int* idx, int label) {
|
16 |
+
idx[i] = c;
|
17 |
+
if(pts_cnt[i] < min_pts) return;
|
18 |
+
|
19 |
+
for(int j=0;j<n;j++) {
|
20 |
+
|
21 |
+
int adj = pts_adj[i * n + j];
|
22 |
+
printf("%d %d %d\n", i * n, i * n + j, adj);
|
23 |
+
if (adj == -1) break;
|
24 |
+
if (idx[adj] == -1)
|
25 |
+
dfs(adj, c, n, min_pts, pts_cnt, pts_adj, idx, label);
|
26 |
+
}
|
27 |
+
}
|
28 |
+
*/
|
29 |
+
|
30 |
+
__global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx,
|
31 |
+
int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) {
|
32 |
+
// xyz: (B, N, 3)
|
33 |
+
// output:
|
34 |
+
// idx: (B, N)
|
35 |
+
int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
36 |
+
if (bs_idx >= b) return;
|
37 |
+
|
38 |
+
xyz += bs_idx * n * 3;
|
39 |
+
idx += bs_idx * n;
|
40 |
+
pts_cnt += bs_idx * n;
|
41 |
+
pts_stack += bs_idx * n;
|
42 |
+
pts_adj += bs_idx * n * n;
|
43 |
+
|
44 |
+
for(int i=0;i<n;i++) {
|
45 |
+
pts_cnt[i] = 0;
|
46 |
+
for(int j=0;j<n;j++) {
|
47 |
+
pts_adj[i * n + j] = -1;
|
48 |
+
if(i==j) continue;
|
49 |
+
float x1 = xyz[i * 3 + 0];
|
50 |
+
float y1 = xyz[i * 3 + 1];
|
51 |
+
float z1 = xyz[i * 3 + 2];
|
52 |
+
float x2 = xyz[j * 3 + 0];
|
53 |
+
float y2 = xyz[j * 3 + 1];
|
54 |
+
float z2 = xyz[j * 3 + 2];
|
55 |
+
|
56 |
+
if(get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue;
|
57 |
+
if(get_dis(x1, y1, z1, x2, y2, z2) <= eps) {
|
58 |
+
pts_adj[i * n + pts_cnt[i]] = j;
|
59 |
+
pts_cnt[i] += 1;
|
60 |
+
}
|
61 |
+
|
62 |
+
}
|
63 |
+
}
|
64 |
+
|
65 |
+
int cluster_idx = 0;
|
66 |
+
|
67 |
+
for(int i=0;i<n;i++) {
|
68 |
+
if(idx[i] != -1) continue;
|
69 |
+
|
70 |
+
if(pts_cnt[i] >= min_pts) {
|
71 |
+
for(int j=0;j<n;j++)
|
72 |
+
pts_stack[j] = -1;
|
73 |
+
pts_stack[0] = i;
|
74 |
+
int stack_idx = 0;
|
75 |
+
int stack_len = 1;
|
76 |
+
while (stack_idx < n && pts_stack[stack_idx] != -1)
|
77 |
+
{
|
78 |
+
int pts_idx = pts_stack[stack_idx];
|
79 |
+
idx[pts_idx] = cluster_idx;
|
80 |
+
if(pts_cnt[pts_idx] < min_pts){
|
81 |
+
stack_idx += 1;
|
82 |
+
continue;
|
83 |
+
}
|
84 |
+
for(int j=0;j<n;j++) {
|
85 |
+
int adj = pts_adj[pts_idx * n + j];
|
86 |
+
if (adj == -1) break;
|
87 |
+
if (idx[adj] == -1)
|
88 |
+
{
|
89 |
+
idx[adj] = -2;
|
90 |
+
pts_stack[stack_len++] = adj;
|
91 |
+
}
|
92 |
+
}
|
93 |
+
stack_idx += 1;
|
94 |
+
}
|
95 |
+
cluster_idx += 1;
|
96 |
+
}
|
97 |
+
}
|
98 |
+
}
|
99 |
+
|
100 |
+
|
101 |
+
void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) {
|
102 |
+
// xyz: (B, N, 3)
|
103 |
+
// output:
|
104 |
+
// idx: (B, N)
|
105 |
+
|
106 |
+
cudaError_t err;
|
107 |
+
|
108 |
+
dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
|
109 |
+
dim3 threads(THREADS_PER_BLOCK);
|
110 |
+
|
111 |
+
int* pts_cnt;
|
112 |
+
int* pts_stack;
|
113 |
+
int* pts_adj;
|
114 |
+
|
115 |
+
err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int));
|
116 |
+
if (cudaSuccess != err) {
|
117 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
118 |
+
exit(-1);
|
119 |
+
}
|
120 |
+
|
121 |
+
err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int));
|
122 |
+
if (cudaSuccess != err) {
|
123 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
124 |
+
exit(-1);
|
125 |
+
}
|
126 |
+
|
127 |
+
err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int));
|
128 |
+
if (cudaSuccess != err) {
|
129 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
130 |
+
exit(-1);
|
131 |
+
}
|
132 |
+
|
133 |
+
dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack);
|
134 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
135 |
+
cudaFree(pts_cnt);
|
136 |
+
cudaFree(pts_stack);
|
137 |
+
cudaFree(pts_adj);
|
138 |
+
err = cudaGetLastError();
|
139 |
+
if (cudaSuccess != err) {
|
140 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
141 |
+
exit(-1);
|
142 |
+
}
|
143 |
+
}
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
__global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx,
|
148 |
+
float *__restrict__ new_xyz, int *__restrict__ num) {
|
149 |
+
int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
150 |
+
if (bs_idx >= b ) return;
|
151 |
+
|
152 |
+
xyz += bs_idx * n * 3;
|
153 |
+
idx += bs_idx * n;
|
154 |
+
new_xyz += bs_idx * m * 3;
|
155 |
+
num += bs_idx * m;
|
156 |
+
|
157 |
+
for(int i=0;i<n;i++) {
|
158 |
+
if (idx[i] == -1) continue;
|
159 |
+
int c_idx = idx[i];
|
160 |
+
new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0];
|
161 |
+
new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1];
|
162 |
+
new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2];
|
163 |
+
num[c_idx] += 1;
|
164 |
+
}
|
165 |
+
for(int i=0;i<m;i++) {
|
166 |
+
if (num[i] == 0) break;
|
167 |
+
new_xyz[i * 3 + 0] /= num[i];
|
168 |
+
new_xyz[i * 3 + 1] /= num[i];
|
169 |
+
new_xyz[i * 3 + 2] /= num[i];
|
170 |
+
}
|
171 |
+
|
172 |
+
}
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) {
|
178 |
+
cudaError_t err;
|
179 |
+
|
180 |
+
dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
|
181 |
+
dim3 threads(THREADS_PER_BLOCK);
|
182 |
+
|
183 |
+
cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num);
|
184 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
185 |
+
err = cudaGetLastError();
|
186 |
+
if (cudaSuccess != err) {
|
187 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
188 |
+
exit(-1);
|
189 |
+
}
|
190 |
+
}
|
191 |
+
|
192 |
+
|
pc_util/src/cluster_gpu.h
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef _CLUSTER_GPU_H
|
2 |
+
#define _CLUSTER_GPU_H
|
3 |
+
|
4 |
+
#include <torch/serialize/tensor.h>
|
5 |
+
#include <vector>
|
6 |
+
#include <cuda.h>
|
7 |
+
#include <cuda_runtime_api.h>
|
8 |
+
|
9 |
+
int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor);
|
10 |
+
|
11 |
+
void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx);
|
12 |
+
|
13 |
+
int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
|
14 |
+
at::Tensor new_xyz_tensor, at::Tensor num_tensor);
|
15 |
+
|
16 |
+
void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num);
|
17 |
+
|
18 |
+
|
19 |
+
int dbscan_wrapper_stack(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor,
|
20 |
+
at::Tensor idx_tensor);
|
21 |
+
|
22 |
+
|
23 |
+
void dbscan_kernel_launcher_stack(int b, int n, float eps, int min_pts,
|
24 |
+
const float *xyz, const int *xyz_batch_cnt, int *idx);
|
25 |
+
|
26 |
+
int cluster_pts_wrapper_stack(int B, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor,
|
27 |
+
at::Tensor new_xyz_tensor, at::Tensor cluster_cnt_tensor);
|
28 |
+
|
29 |
+
|
30 |
+
void cluster_pts_kernel_launcher_stack(int B, const float *xyz, const int *xyz_batch_cnt, int *idx,
|
31 |
+
const float *new_xyz, const int *cluster_cnt);
|
32 |
+
|
33 |
+
#endif
|
34 |
+
|
pc_util/src/cuda_utils.h
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef _CUDA_UTILS_H
|
2 |
+
#define _CUDA_UTILS_H
|
3 |
+
|
4 |
+
#include <cmath>
|
5 |
+
|
6 |
+
#define TOTAL_THREADS 1024
|
7 |
+
#define THREADS_PER_BLOCK 256
|
8 |
+
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
|
9 |
+
|
10 |
+
inline int opt_n_threads(int work_size) {
|
11 |
+
const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
|
12 |
+
|
13 |
+
return max(min(1 << pow_2, TOTAL_THREADS), 1);
|
14 |
+
}
|
15 |
+
#endif
|
pc_util/src/group_points.cpp
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/serialize/tensor.h>
|
2 |
+
#include <cuda.h>
|
3 |
+
#include <cuda_runtime_api.h>
|
4 |
+
#include <vector>
|
5 |
+
// #include <THC/THC.h>
|
6 |
+
#include "group_points_gpu.h"
|
7 |
+
|
8 |
+
// extern THCState *state;
|
9 |
+
|
10 |
+
#include <ATen/cuda/CUDAContext.h>
|
11 |
+
#include <ATen/cuda/CUDAEvent.h>
|
12 |
+
// cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
13 |
+
|
14 |
+
#define CHECK_CUDA(x) do { \
|
15 |
+
if (!x.type().is_cuda()) { \
|
16 |
+
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
17 |
+
exit(-1); \
|
18 |
+
} \
|
19 |
+
} while (0)
|
20 |
+
#define CHECK_CONTIGUOUS(x) do { \
|
21 |
+
if (!x.is_contiguous()) { \
|
22 |
+
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
23 |
+
exit(-1); \
|
24 |
+
} \
|
25 |
+
} while (0)
|
26 |
+
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
|
31 |
+
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
|
32 |
+
|
33 |
+
float *grad_points = grad_points_tensor.data<float>();
|
34 |
+
const int *idx = idx_tensor.data<int>();
|
35 |
+
const float *grad_out = grad_out_tensor.data<float>();
|
36 |
+
|
37 |
+
group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points);
|
38 |
+
return 1;
|
39 |
+
}
|
40 |
+
|
41 |
+
|
42 |
+
int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
|
43 |
+
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
|
44 |
+
|
45 |
+
const float *points = points_tensor.data<float>();
|
46 |
+
const int *idx = idx_tensor.data<int>();
|
47 |
+
float *out = out_tensor.data<float>();
|
48 |
+
|
49 |
+
group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
|
50 |
+
return 1;
|
51 |
+
}
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
|
60 |
+
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
|
61 |
+
at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor) {
|
62 |
+
|
63 |
+
CHECK_INPUT(grad_out_tensor);
|
64 |
+
CHECK_INPUT(idx_tensor);
|
65 |
+
CHECK_INPUT(idx_batch_cnt_tensor);
|
66 |
+
CHECK_INPUT(features_batch_cnt_tensor);
|
67 |
+
CHECK_INPUT(grad_features_tensor);
|
68 |
+
|
69 |
+
const float *grad_out = grad_out_tensor.data<float>();
|
70 |
+
const int *idx = idx_tensor.data<int>();
|
71 |
+
const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
|
72 |
+
const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
|
73 |
+
float *grad_features = grad_features_tensor.data<float>();
|
74 |
+
|
75 |
+
group_points_grad_kernel_launcher_stack(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
|
76 |
+
return 1;
|
77 |
+
}
|
78 |
+
|
79 |
+
|
80 |
+
int group_points_wrapper_stack(int B, int M, int C, int nsample,
|
81 |
+
at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
|
82 |
+
at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor) {
|
83 |
+
|
84 |
+
CHECK_INPUT(features_tensor);
|
85 |
+
CHECK_INPUT(features_batch_cnt_tensor);
|
86 |
+
CHECK_INPUT(idx_tensor);
|
87 |
+
CHECK_INPUT(idx_batch_cnt_tensor);
|
88 |
+
CHECK_INPUT(out_tensor);
|
89 |
+
|
90 |
+
const float *features = features_tensor.data<float>();
|
91 |
+
const int *idx = idx_tensor.data<int>();
|
92 |
+
const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
|
93 |
+
const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
|
94 |
+
float *out = out_tensor.data<float>();
|
95 |
+
|
96 |
+
group_points_kernel_launcher_stack(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
|
97 |
+
return 1;
|
98 |
+
}
|
pc_util/src/group_points_gpu.cu
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <stdio.h>
|
2 |
+
#include <stdlib.h>
|
3 |
+
|
4 |
+
#include "cuda_utils.h"
|
5 |
+
#include "group_points_gpu.h"
|
6 |
+
|
7 |
+
|
8 |
+
__global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
|
9 |
+
const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
|
10 |
+
// grad_out: (B, C, npoints, nsample)
|
11 |
+
// idx: (B, npoints, nsample)
|
12 |
+
// output:
|
13 |
+
// grad_points: (B, C, N)
|
14 |
+
int bs_idx = blockIdx.z;
|
15 |
+
int c_idx = blockIdx.y;
|
16 |
+
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
17 |
+
int pt_idx = index / nsample;
|
18 |
+
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
|
19 |
+
|
20 |
+
int sample_idx = index % nsample;
|
21 |
+
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
|
22 |
+
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
|
23 |
+
|
24 |
+
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]);
|
25 |
+
}
|
26 |
+
|
27 |
+
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
|
28 |
+
const float *grad_out, const int *idx, float *grad_points) {
|
29 |
+
// grad_out: (B, C, npoints, nsample)
|
30 |
+
// idx: (B, npoints, nsample)
|
31 |
+
// output:
|
32 |
+
// grad_points: (B, C, N)
|
33 |
+
cudaError_t err;
|
34 |
+
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
35 |
+
dim3 threads(THREADS_PER_BLOCK);
|
36 |
+
|
37 |
+
group_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
|
38 |
+
|
39 |
+
err = cudaGetLastError();
|
40 |
+
if (cudaSuccess != err) {
|
41 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
42 |
+
exit(-1);
|
43 |
+
}
|
44 |
+
}
|
45 |
+
|
46 |
+
|
47 |
+
__global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
|
48 |
+
const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
|
49 |
+
// points: (B, C, N)
|
50 |
+
// idx: (B, npoints, nsample)
|
51 |
+
// output:
|
52 |
+
// out: (B, C, npoints, nsample)
|
53 |
+
int bs_idx = blockIdx.z;
|
54 |
+
int c_idx = blockIdx.y;
|
55 |
+
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
56 |
+
int pt_idx = index / nsample;
|
57 |
+
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
|
58 |
+
|
59 |
+
int sample_idx = index % nsample;
|
60 |
+
|
61 |
+
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
|
62 |
+
int in_idx = bs_idx * c * n + c_idx * n + idx[0];
|
63 |
+
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
|
64 |
+
|
65 |
+
out[out_idx] = points[in_idx];
|
66 |
+
}
|
67 |
+
|
68 |
+
|
69 |
+
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
|
70 |
+
const float *points, const int *idx, float *out) {
|
71 |
+
// points: (B, C, N)
|
72 |
+
// idx: (B, npoints, nsample)
|
73 |
+
// output:
|
74 |
+
// out: (B, C, npoints, nsample)
|
75 |
+
cudaError_t err;
|
76 |
+
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
77 |
+
dim3 threads(THREADS_PER_BLOCK);
|
78 |
+
|
79 |
+
group_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
|
80 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
81 |
+
err = cudaGetLastError();
|
82 |
+
if (cudaSuccess != err) {
|
83 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
84 |
+
exit(-1);
|
85 |
+
}
|
86 |
+
}
|
87 |
+
|
88 |
+
|
89 |
+
__global__ void group_points_grad_kernel_stack(int B, int M, int C, int N, int nsample,
|
90 |
+
const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
|
91 |
+
// :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
|
92 |
+
// :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
|
93 |
+
// :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
|
94 |
+
// :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
|
95 |
+
// :return:
|
96 |
+
// grad_features: (N1 + N2 ..., C) gradient of the features
|
97 |
+
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
98 |
+
int sample_idx = index % nsample;
|
99 |
+
int C_idx = (index / nsample) % C;
|
100 |
+
int pt_idx = (index / nsample / C);
|
101 |
+
|
102 |
+
if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
|
103 |
+
|
104 |
+
int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
|
105 |
+
for (int k = 1; k < B; k++){
|
106 |
+
if (pt_idx < pt_cnt) break;
|
107 |
+
pt_cnt += idx_batch_cnt[k];
|
108 |
+
bs_idx = k;
|
109 |
+
}
|
110 |
+
|
111 |
+
int features_batch_start_idx = 0;
|
112 |
+
for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
|
113 |
+
|
114 |
+
grad_out += pt_idx * C * nsample + C_idx * nsample + sample_idx;
|
115 |
+
idx += pt_idx * nsample + sample_idx;
|
116 |
+
grad_features += (features_batch_start_idx + idx[0]) * C + C_idx;
|
117 |
+
|
118 |
+
atomicAdd(grad_features, grad_out[0]);
|
119 |
+
}
|
120 |
+
|
121 |
+
void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
|
122 |
+
const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
|
123 |
+
// :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
|
124 |
+
// :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
|
125 |
+
// :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
|
126 |
+
// :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
|
127 |
+
// :return:
|
128 |
+
// grad_features: (N1 + N2 ..., C) gradient of the features
|
129 |
+
|
130 |
+
cudaError_t err;
|
131 |
+
// dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
132 |
+
dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
|
133 |
+
dim3 threads(THREADS_PER_BLOCK);
|
134 |
+
|
135 |
+
group_points_grad_kernel_stack<<<blocks, threads>>>(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
|
136 |
+
|
137 |
+
err = cudaGetLastError();
|
138 |
+
if (cudaSuccess != err) {
|
139 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
140 |
+
exit(-1);
|
141 |
+
}
|
142 |
+
}
|
143 |
+
|
144 |
+
|
145 |
+
__global__ void group_points_kernel_stack(int B, int M, int C, int nsample,
|
146 |
+
const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
|
147 |
+
// :param features: (N1 + N2 ..., C) tensor of features to group
|
148 |
+
// :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
|
149 |
+
// :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
|
150 |
+
// :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
|
151 |
+
// :return:
|
152 |
+
// output: (M1 + M2, C, nsample) tensor
|
153 |
+
int index = blockIdx.x * blockDim.x + threadIdx.x;
|
154 |
+
int sample_idx = index % nsample;
|
155 |
+
int C_idx = (index / nsample) % C;
|
156 |
+
int pt_idx = (index / nsample / C);
|
157 |
+
|
158 |
+
if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
|
159 |
+
|
160 |
+
int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
|
161 |
+
for (int k = 1; k < B; k++){
|
162 |
+
if (pt_idx < pt_cnt) break;
|
163 |
+
pt_cnt += idx_batch_cnt[k];
|
164 |
+
bs_idx = k;
|
165 |
+
}
|
166 |
+
|
167 |
+
int features_batch_start_idx = 0;
|
168 |
+
for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
|
169 |
+
features += features_batch_start_idx * C;
|
170 |
+
|
171 |
+
idx += pt_idx * nsample + sample_idx;
|
172 |
+
int in_idx = idx[0] * C + C_idx;
|
173 |
+
int out_idx = pt_idx * C * nsample + C_idx * nsample + sample_idx;
|
174 |
+
|
175 |
+
out[out_idx] = features[in_idx];
|
176 |
+
}
|
177 |
+
|
178 |
+
|
179 |
+
void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
|
180 |
+
const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
|
181 |
+
// :param features: (N1 + N2 ..., C) tensor of features to group
|
182 |
+
// :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
|
183 |
+
// :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
|
184 |
+
// :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
|
185 |
+
// :return:
|
186 |
+
// output: (M1 + M2, C, nsample) tensor
|
187 |
+
|
188 |
+
cudaError_t err;
|
189 |
+
dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
|
190 |
+
dim3 threads(THREADS_PER_BLOCK);
|
191 |
+
|
192 |
+
group_points_kernel_stack<<<blocks, threads>>>(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
|
193 |
+
// cudaDeviceSynchronize(); // for using printf in kernel function
|
194 |
+
err = cudaGetLastError();
|
195 |
+
if (cudaSuccess != err) {
|
196 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
197 |
+
exit(-1);
|
198 |
+
}
|
199 |
+
}
|
pc_util/src/group_points_gpu.h
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef _GROUP_POINTS_GPU_H
|
2 |
+
#define _GROUP_POINTS_GPU_H
|
3 |
+
|
4 |
+
#include <torch/serialize/tensor.h>
|
5 |
+
#include <cuda.h>
|
6 |
+
#include <cuda_runtime_api.h>
|
7 |
+
#include <vector>
|
8 |
+
|
9 |
+
|
10 |
+
int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
|
11 |
+
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
|
12 |
+
|
13 |
+
void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
|
14 |
+
const float *points, const int *idx, float *out);
|
15 |
+
|
16 |
+
int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
|
17 |
+
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
|
18 |
+
|
19 |
+
void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
|
20 |
+
const float *grad_out, const int *idx, float *grad_points);
|
21 |
+
|
22 |
+
int group_points_wrapper_stack(int B, int M, int C, int nsample,
|
23 |
+
at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
|
24 |
+
at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor);
|
25 |
+
|
26 |
+
void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
|
27 |
+
const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out);
|
28 |
+
|
29 |
+
int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
|
30 |
+
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
|
31 |
+
at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor);
|
32 |
+
|
33 |
+
void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
|
34 |
+
const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features);
|
35 |
+
|
36 |
+
#endif
|
pc_util/src/interpolate.cpp
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/serialize/tensor.h>
|
2 |
+
#include <vector>
|
3 |
+
// #include <THC/THC.h>
|
4 |
+
#include <math.h>
|
5 |
+
#include <stdio.h>
|
6 |
+
#include <stdlib.h>
|
7 |
+
#include <cuda.h>
|
8 |
+
#include <cuda_runtime_api.h>
|
9 |
+
#include "interpolate_gpu.h"
|
10 |
+
|
11 |
+
// extern THCState *state;
|
12 |
+
|
13 |
+
#include <ATen/cuda/CUDAContext.h>
|
14 |
+
#include <ATen/cuda/CUDAEvent.h>
|
15 |
+
// cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
16 |
+
|
17 |
+
#define CHECK_CUDA(x) do { \
|
18 |
+
if (!x.type().is_cuda()) { \
|
19 |
+
fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
20 |
+
exit(-1); \
|
21 |
+
} \
|
22 |
+
} while (0)
|
23 |
+
#define CHECK_CONTIGUOUS(x) do { \
|
24 |
+
if (!x.is_contiguous()) { \
|
25 |
+
fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
|
26 |
+
exit(-1); \
|
27 |
+
} \
|
28 |
+
} while (0)
|
29 |
+
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
|
30 |
+
|
31 |
+
|
32 |
+
void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
|
33 |
+
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
|
34 |
+
const float *unknown = unknown_tensor.data<float>();
|
35 |
+
const float *known = known_tensor.data<float>();
|
36 |
+
float *dist2 = dist2_tensor.data<float>();
|
37 |
+
int *idx = idx_tensor.data<int>();
|
38 |
+
|
39 |
+
three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx);
|
40 |
+
}
|
41 |
+
|
42 |
+
|
43 |
+
void three_interpolate_wrapper_fast(int b, int c, int m, int n,
|
44 |
+
at::Tensor points_tensor,
|
45 |
+
at::Tensor idx_tensor,
|
46 |
+
at::Tensor weight_tensor,
|
47 |
+
at::Tensor out_tensor) {
|
48 |
+
|
49 |
+
const float *points = points_tensor.data<float>();
|
50 |
+
const float *weight = weight_tensor.data<float>();
|
51 |
+
float *out = out_tensor.data<float>();
|
52 |
+
const int *idx = idx_tensor.data<int>();
|
53 |
+
|
54 |
+
|
55 |
+
three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out);
|
56 |
+
}
|
57 |
+
|
58 |
+
void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
|
59 |
+
at::Tensor grad_out_tensor,
|
60 |
+
at::Tensor idx_tensor,
|
61 |
+
at::Tensor weight_tensor,
|
62 |
+
at::Tensor grad_points_tensor) {
|
63 |
+
|
64 |
+
const float *grad_out = grad_out_tensor.data<float>();
|
65 |
+
const float *weight = weight_tensor.data<float>();
|
66 |
+
float *grad_points = grad_points_tensor.data<float>();
|
67 |
+
const int *idx = idx_tensor.data<int>();
|
68 |
+
|
69 |
+
three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points);
|
70 |
+
}
|
71 |
+
|
72 |
+
|
73 |
+
void three_nn_wrapper_stack(at::Tensor unknown_tensor,
|
74 |
+
at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
|
75 |
+
at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor){
|
76 |
+
// unknown: (N1 + N2 ..., 3)
|
77 |
+
// unknown_batch_cnt: (batch_size), [N1, N2, ...]
|
78 |
+
// known: (M1 + M2 ..., 3)
|
79 |
+
// known_batch_cnt: (batch_size), [M1, M2, ...]
|
80 |
+
// Return:
|
81 |
+
// dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
|
82 |
+
// idx: (N1 + N2 ..., 3) index of the three nearest neighbors
|
83 |
+
CHECK_INPUT(unknown_tensor);
|
84 |
+
CHECK_INPUT(unknown_batch_cnt_tensor);
|
85 |
+
CHECK_INPUT(known_tensor);
|
86 |
+
CHECK_INPUT(known_batch_cnt_tensor);
|
87 |
+
CHECK_INPUT(dist2_tensor);
|
88 |
+
CHECK_INPUT(idx_tensor);
|
89 |
+
|
90 |
+
int batch_size = unknown_batch_cnt_tensor.size(0);
|
91 |
+
int N = unknown_tensor.size(0);
|
92 |
+
int M = known_tensor.size(0);
|
93 |
+
const float *unknown = unknown_tensor.data<float>();
|
94 |
+
const int *unknown_batch_cnt = unknown_batch_cnt_tensor.data<int>();
|
95 |
+
const float *known = known_tensor.data<float>();
|
96 |
+
const int *known_batch_cnt = known_batch_cnt_tensor.data<int>();
|
97 |
+
float *dist2 = dist2_tensor.data<float>();
|
98 |
+
int *idx = idx_tensor.data<int>();
|
99 |
+
|
100 |
+
three_nn_kernel_launcher_stack(batch_size, N, M, unknown, unknown_batch_cnt, known, known_batch_cnt, dist2, idx);
|
101 |
+
}
|
102 |
+
|
103 |
+
|
104 |
+
void three_interpolate_wrapper_stack(at::Tensor features_tensor,
|
105 |
+
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) {
|
106 |
+
// features_tensor: (M1 + M2 ..., C)
|
107 |
+
// idx_tensor: [N1 + N2 ..., 3]
|
108 |
+
// weight_tensor: [N1 + N2 ..., 3]
|
109 |
+
// Return:
|
110 |
+
// out_tensor: (N1 + N2 ..., C)
|
111 |
+
CHECK_INPUT(features_tensor);
|
112 |
+
CHECK_INPUT(idx_tensor);
|
113 |
+
CHECK_INPUT(weight_tensor);
|
114 |
+
CHECK_INPUT(out_tensor);
|
115 |
+
|
116 |
+
int N = out_tensor.size(0);
|
117 |
+
int channels = features_tensor.size(1);
|
118 |
+
const float *features = features_tensor.data<float>();
|
119 |
+
const float *weight = weight_tensor.data<float>();
|
120 |
+
const int *idx = idx_tensor.data<int>();
|
121 |
+
float *out = out_tensor.data<float>();
|
122 |
+
|
123 |
+
three_interpolate_kernel_launcher_stack(N, channels, features, idx, weight, out);
|
124 |
+
}
|
125 |
+
|
126 |
+
|
127 |
+
void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
|
128 |
+
at::Tensor weight_tensor, at::Tensor grad_features_tensor) {
|
129 |
+
// grad_out_tensor: (N1 + N2 ..., C)
|
130 |
+
// idx_tensor: [N1 + N2 ..., 3]
|
131 |
+
// weight_tensor: [N1 + N2 ..., 3]
|
132 |
+
// Return:
|
133 |
+
// grad_features_tensor: (M1 + M2 ..., C)
|
134 |
+
CHECK_INPUT(grad_out_tensor);
|
135 |
+
CHECK_INPUT(idx_tensor);
|
136 |
+
CHECK_INPUT(weight_tensor);
|
137 |
+
CHECK_INPUT(grad_features_tensor);
|
138 |
+
|
139 |
+
int N = grad_out_tensor.size(0);
|
140 |
+
int channels = grad_out_tensor.size(1);
|
141 |
+
const float *grad_out = grad_out_tensor.data<float>();
|
142 |
+
const float *weight = weight_tensor.data<float>();
|
143 |
+
const int *idx = idx_tensor.data<int>();
|
144 |
+
float *grad_features = grad_features_tensor.data<float>();
|
145 |
+
|
146 |
+
// printf("N=%d, channels=%d\n", N, channels);
|
147 |
+
three_interpolate_grad_kernel_launcher_stack(N, channels, grad_out, idx, weight, grad_features);
|
148 |
+
}
|
pc_util/src/interpolate_gpu.cu
ADDED
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <math.h>
|
2 |
+
#include <stdio.h>
|
3 |
+
#include <stdlib.h>
|
4 |
+
|
5 |
+
#include "cuda_utils.h"
|
6 |
+
#include "interpolate_gpu.h"
|
7 |
+
|
8 |
+
|
9 |
+
__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
|
10 |
+
const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
|
11 |
+
// unknown: (B, N, 3)
|
12 |
+
// known: (B, M, 3)
|
13 |
+
// output:
|
14 |
+
// dist2: (B, N, 3)
|
15 |
+
// idx: (B, N, 3)
|
16 |
+
|
17 |
+
int bs_idx = blockIdx.y;
|
18 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
19 |
+
if (bs_idx >= b || pt_idx >= n) return;
|
20 |
+
|
21 |
+
unknown += bs_idx * n * 3 + pt_idx * 3;
|
22 |
+
known += bs_idx * m * 3;
|
23 |
+
dist2 += bs_idx * n * 3 + pt_idx * 3;
|
24 |
+
idx += bs_idx * n * 3 + pt_idx * 3;
|
25 |
+
|
26 |
+
float ux = unknown[0];
|
27 |
+
float uy = unknown[1];
|
28 |
+
float uz = unknown[2];
|
29 |
+
|
30 |
+
double best1 = 1e40, best2 = 1e40, best3 = 1e40;
|
31 |
+
int besti1 = 0, besti2 = 0, besti3 = 0;
|
32 |
+
for (int k = 0; k < m; ++k) {
|
33 |
+
float x = known[k * 3 + 0];
|
34 |
+
float y = known[k * 3 + 1];
|
35 |
+
float z = known[k * 3 + 2];
|
36 |
+
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
|
37 |
+
if (d < best1) {
|
38 |
+
best3 = best2; besti3 = besti2;
|
39 |
+
best2 = best1; besti2 = besti1;
|
40 |
+
best1 = d; besti1 = k;
|
41 |
+
}
|
42 |
+
else if (d < best2) {
|
43 |
+
best3 = best2; besti3 = besti2;
|
44 |
+
best2 = d; besti2 = k;
|
45 |
+
}
|
46 |
+
else if (d < best3) {
|
47 |
+
best3 = d; besti3 = k;
|
48 |
+
}
|
49 |
+
}
|
50 |
+
dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
|
51 |
+
idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
|
52 |
+
}
|
53 |
+
|
54 |
+
|
55 |
+
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
|
56 |
+
const float *known, float *dist2, int *idx) {
|
57 |
+
// unknown: (B, N, 3)
|
58 |
+
// known: (B, M, 3)
|
59 |
+
// output:
|
60 |
+
// dist2: (B, N, 3)
|
61 |
+
// idx: (B, N, 3)
|
62 |
+
|
63 |
+
cudaError_t err;
|
64 |
+
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
|
65 |
+
dim3 threads(THREADS_PER_BLOCK);
|
66 |
+
|
67 |
+
three_nn_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
|
68 |
+
|
69 |
+
err = cudaGetLastError();
|
70 |
+
if (cudaSuccess != err) {
|
71 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
72 |
+
exit(-1);
|
73 |
+
}
|
74 |
+
}
|
75 |
+
|
76 |
+
|
77 |
+
__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
|
78 |
+
const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
|
79 |
+
// points: (B, C, M)
|
80 |
+
// idx: (B, N, 3)
|
81 |
+
// weight: (B, N, 3)
|
82 |
+
// output:
|
83 |
+
// out: (B, C, N)
|
84 |
+
|
85 |
+
int bs_idx = blockIdx.z;
|
86 |
+
int c_idx = blockIdx.y;
|
87 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
88 |
+
|
89 |
+
if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
|
90 |
+
|
91 |
+
weight += bs_idx * n * 3 + pt_idx * 3;
|
92 |
+
points += bs_idx * c * m + c_idx * m;
|
93 |
+
idx += bs_idx * n * 3 + pt_idx * 3;
|
94 |
+
out += bs_idx * c * n + c_idx * n;
|
95 |
+
|
96 |
+
out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
|
97 |
+
}
|
98 |
+
|
99 |
+
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
|
100 |
+
const float *points, const int *idx, const float *weight, float *out) {
|
101 |
+
// points: (B, C, M)
|
102 |
+
// idx: (B, N, 3)
|
103 |
+
// weight: (B, N, 3)
|
104 |
+
// output:
|
105 |
+
// out: (B, C, N)
|
106 |
+
|
107 |
+
cudaError_t err;
|
108 |
+
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
109 |
+
dim3 threads(THREADS_PER_BLOCK);
|
110 |
+
three_interpolate_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
|
111 |
+
|
112 |
+
err = cudaGetLastError();
|
113 |
+
if (cudaSuccess != err) {
|
114 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
115 |
+
exit(-1);
|
116 |
+
}
|
117 |
+
}
|
118 |
+
|
119 |
+
|
120 |
+
__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
|
121 |
+
const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
|
122 |
+
// grad_out: (B, C, N)
|
123 |
+
// weight: (B, N, 3)
|
124 |
+
// output:
|
125 |
+
// grad_points: (B, C, M)
|
126 |
+
|
127 |
+
int bs_idx = blockIdx.z;
|
128 |
+
int c_idx = blockIdx.y;
|
129 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
130 |
+
|
131 |
+
if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
|
132 |
+
|
133 |
+
grad_out += bs_idx * c * n + c_idx * n + pt_idx;
|
134 |
+
weight += bs_idx * n * 3 + pt_idx * 3;
|
135 |
+
grad_points += bs_idx * c * m + c_idx * m;
|
136 |
+
idx += bs_idx * n * 3 + pt_idx * 3;
|
137 |
+
|
138 |
+
|
139 |
+
atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
|
140 |
+
atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
|
141 |
+
atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
|
142 |
+
}
|
143 |
+
|
144 |
+
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
|
145 |
+
const int *idx, const float *weight, float *grad_points) {
|
146 |
+
// grad_out: (B, C, N)
|
147 |
+
// weight: (B, N, 3)
|
148 |
+
// output:
|
149 |
+
// grad_points: (B, C, M)
|
150 |
+
|
151 |
+
cudaError_t err;
|
152 |
+
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
153 |
+
dim3 threads(THREADS_PER_BLOCK);
|
154 |
+
three_interpolate_grad_kernel_fast<<<blocks, threads>>>(b, c, n, m, grad_out, idx, weight, grad_points);
|
155 |
+
|
156 |
+
err = cudaGetLastError();
|
157 |
+
if (cudaSuccess != err) {
|
158 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
159 |
+
exit(-1);
|
160 |
+
}
|
161 |
+
}
|
162 |
+
|
163 |
+
|
164 |
+
__global__ void three_nn_kernel_stack(int batch_size, int N, int M, const float *unknown,
|
165 |
+
const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
|
166 |
+
float *dist2, int *idx) {
|
167 |
+
// unknown: (N1 + N2 ..., 3)
|
168 |
+
// unknown_batch_cnt: (batch_size), [N1, N2, ...]
|
169 |
+
// known: (M1 + M2 ..., 3)
|
170 |
+
// known_batch_cnt: (batch_size), [M1, M2, ...]
|
171 |
+
// Return:
|
172 |
+
// dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
|
173 |
+
// idx: (N1 + N2 ..., 3) index of the three nearest neighbors
|
174 |
+
|
175 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
176 |
+
if (pt_idx >= N) return;
|
177 |
+
|
178 |
+
int bs_idx = 0, pt_cnt = unknown_batch_cnt[0];
|
179 |
+
for (int k = 1; k < batch_size; k++){
|
180 |
+
if (pt_idx < pt_cnt) break;
|
181 |
+
pt_cnt += unknown_batch_cnt[k];
|
182 |
+
bs_idx = k;
|
183 |
+
}
|
184 |
+
|
185 |
+
int cur_num_known_points = known_batch_cnt[bs_idx];
|
186 |
+
|
187 |
+
int known_batch_start_idx = 0;
|
188 |
+
for (int k = 0; k < bs_idx; k++) known_batch_start_idx += known_batch_cnt[k];
|
189 |
+
|
190 |
+
known += known_batch_start_idx * 3;
|
191 |
+
unknown += pt_idx * 3;
|
192 |
+
dist2 += pt_idx * 3;
|
193 |
+
idx += pt_idx * 3;
|
194 |
+
|
195 |
+
float ux = unknown[0];
|
196 |
+
float uy = unknown[1];
|
197 |
+
float uz = unknown[2];
|
198 |
+
|
199 |
+
double best1 = 1e40, best2 = 1e40, best3 = 1e40;
|
200 |
+
int besti1 = 0, besti2 = 0, besti3 = 0;
|
201 |
+
for (int k = 0; k < cur_num_known_points; ++k) {
|
202 |
+
float x = known[k * 3 + 0];
|
203 |
+
float y = known[k * 3 + 1];
|
204 |
+
float z = known[k * 3 + 2];
|
205 |
+
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
|
206 |
+
if (d < best1) {
|
207 |
+
best3 = best2; besti3 = besti2;
|
208 |
+
best2 = best1; besti2 = besti1;
|
209 |
+
best1 = d; besti1 = k;
|
210 |
+
}
|
211 |
+
else if (d < best2) {
|
212 |
+
best3 = best2; besti3 = besti2;
|
213 |
+
best2 = d; besti2 = k;
|
214 |
+
}
|
215 |
+
else if (d < best3) {
|
216 |
+
best3 = d; besti3 = k;
|
217 |
+
}
|
218 |
+
}
|
219 |
+
dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
|
220 |
+
idx[0] = besti1 + known_batch_start_idx;
|
221 |
+
idx[1] = besti2 + known_batch_start_idx;
|
222 |
+
idx[2] = besti3 + known_batch_start_idx;
|
223 |
+
}
|
224 |
+
|
225 |
+
|
226 |
+
void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
|
227 |
+
const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
|
228 |
+
float *dist2, int *idx) {
|
229 |
+
// unknown: (N1 + N2 ..., 3)
|
230 |
+
// unknown_batch_cnt: (batch_size), [N1, N2, ...]
|
231 |
+
// known: (M1 + M2 ..., 3)
|
232 |
+
// known_batch_cnt: (batch_size), [M1, M2, ...]
|
233 |
+
// Return:
|
234 |
+
// dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
|
235 |
+
// idx: (N1 + N2 ..., 3) index of the three nearest neighbors
|
236 |
+
|
237 |
+
cudaError_t err;
|
238 |
+
dim3 blocks(DIVUP(N, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
|
239 |
+
dim3 threads(THREADS_PER_BLOCK);
|
240 |
+
|
241 |
+
three_nn_kernel_stack<<<blocks, threads>>>(
|
242 |
+
batch_size, N, M, unknown, unknown_batch_cnt,
|
243 |
+
known, known_batch_cnt, dist2, idx
|
244 |
+
);
|
245 |
+
|
246 |
+
err = cudaGetLastError();
|
247 |
+
if (cudaSuccess != err) {
|
248 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
249 |
+
exit(-1);
|
250 |
+
}
|
251 |
+
}
|
252 |
+
|
253 |
+
|
254 |
+
|
255 |
+
__global__ void three_interpolate_kernel_stack(int N, int channels, const float *features,
|
256 |
+
const int *idx, const float *weight, float *out) {
|
257 |
+
// features: (M1 + M2 ..., C)
|
258 |
+
// idx: [N1 + N2 ..., 3]
|
259 |
+
// weight: [N1 + N2 ..., 3]
|
260 |
+
// Return:
|
261 |
+
// out: (N1 + N2 ..., C)
|
262 |
+
|
263 |
+
int c_idx = blockIdx.y;
|
264 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
265 |
+
if (pt_idx >= N || c_idx >= channels) return;
|
266 |
+
|
267 |
+
weight += pt_idx * 3;
|
268 |
+
idx += pt_idx * 3;
|
269 |
+
out += pt_idx * channels + c_idx;
|
270 |
+
|
271 |
+
out[0] = weight[0] * features[idx[0] * channels + c_idx] +
|
272 |
+
weight[1] * features[idx[1] * channels + c_idx] +
|
273 |
+
weight[2] * features[idx[2] * channels + c_idx];
|
274 |
+
}
|
275 |
+
|
276 |
+
|
277 |
+
|
278 |
+
void three_interpolate_kernel_launcher_stack(int N, int channels,
|
279 |
+
const float *features, const int *idx, const float *weight, float *out) {
|
280 |
+
// features: (M1 + M2 ..., C)
|
281 |
+
// idx: [N1 + N2 ..., 3]
|
282 |
+
// weight: [N1 + N2 ..., 3]
|
283 |
+
// Return:
|
284 |
+
// out: (N1 + N2 ..., C)
|
285 |
+
|
286 |
+
cudaError_t err;
|
287 |
+
dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
|
288 |
+
dim3 threads(THREADS_PER_BLOCK);
|
289 |
+
three_interpolate_kernel_stack<<<blocks, threads>>>(N, channels, features, idx, weight, out);
|
290 |
+
|
291 |
+
err = cudaGetLastError();
|
292 |
+
if (cudaSuccess != err) {
|
293 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
294 |
+
exit(-1);
|
295 |
+
}
|
296 |
+
}
|
297 |
+
|
298 |
+
|
299 |
+
__global__ void three_interpolate_grad_kernel_stack(int N, int channels, const float *grad_out,
|
300 |
+
const int *idx, const float *weight, float *grad_features) {
|
301 |
+
// grad_out_tensor: (N1 + N2 ..., C)
|
302 |
+
// idx_tensor: [N1 + N2 ..., 3]
|
303 |
+
// weight_tensor: [N1 + N2 ..., 3]
|
304 |
+
// Return:
|
305 |
+
// grad_features_tensor: (M1 + M2 ..., C)
|
306 |
+
|
307 |
+
int c_idx = blockIdx.y;
|
308 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
309 |
+
if (pt_idx >= N || c_idx >= channels) return;
|
310 |
+
|
311 |
+
grad_out += pt_idx * channels + c_idx;
|
312 |
+
weight += pt_idx * 3;
|
313 |
+
idx += pt_idx * 3;
|
314 |
+
|
315 |
+
// printf("pt_idx=%d, c_idx=%d, idx=(%d, %d, %d), grad_out=%f\n", pt_idx, c_idx, idx[0], idx[1], idx[2], grad_out[0]);
|
316 |
+
|
317 |
+
atomicAdd(grad_features + idx[0] * channels + c_idx, grad_out[0] * weight[0]);
|
318 |
+
atomicAdd(grad_features + idx[1] * channels + c_idx, grad_out[0] * weight[1]);
|
319 |
+
atomicAdd(grad_features + idx[2] * channels + c_idx, grad_out[0] * weight[2]);
|
320 |
+
}
|
321 |
+
|
322 |
+
|
323 |
+
void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
|
324 |
+
const int *idx, const float *weight, float *grad_features) {
|
325 |
+
// grad_out_tensor: (N1 + N2 ..., C)
|
326 |
+
// idx_tensor: [N1 + N2 ..., 3]
|
327 |
+
// weight_tensor: [N1 + N2 ..., 3]
|
328 |
+
// Return:
|
329 |
+
// grad_features_tensor: (M1 + M2 ..., C)
|
330 |
+
|
331 |
+
cudaError_t err;
|
332 |
+
dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels); // blockIdx.x(col), blockIdx.y(row)
|
333 |
+
dim3 threads(THREADS_PER_BLOCK);
|
334 |
+
three_interpolate_grad_kernel_stack<<<blocks, threads>>>(
|
335 |
+
N, channels, grad_out, idx, weight, grad_features
|
336 |
+
);
|
337 |
+
|
338 |
+
err = cudaGetLastError();
|
339 |
+
if (cudaSuccess != err) {
|
340 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
341 |
+
exit(-1);
|
342 |
+
}
|
343 |
+
}
|
pc_util/src/interpolate_gpu.h
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef _INTERPOLATE_GPU_H
|
2 |
+
#define _INTERPOLATE_GPU_H
|
3 |
+
|
4 |
+
#include <torch/serialize/tensor.h>
|
5 |
+
#include<vector>
|
6 |
+
#include <cuda.h>
|
7 |
+
#include <cuda_runtime_api.h>
|
8 |
+
|
9 |
+
|
10 |
+
void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
|
11 |
+
at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
|
12 |
+
|
13 |
+
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
|
14 |
+
const float *known, float *dist2, int *idx);
|
15 |
+
|
16 |
+
|
17 |
+
void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
|
18 |
+
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
|
19 |
+
|
20 |
+
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
|
21 |
+
const float *points, const int *idx, const float *weight, float *out);
|
22 |
+
|
23 |
+
|
24 |
+
void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
|
25 |
+
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
|
26 |
+
|
27 |
+
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
|
28 |
+
const int *idx, const float *weight, float *grad_points);
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
void three_nn_wrapper_stack(at::Tensor unknown_tensor,
|
33 |
+
at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
|
34 |
+
at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
|
35 |
+
|
36 |
+
|
37 |
+
void three_interpolate_wrapper_stack(at::Tensor features_tensor,
|
38 |
+
at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
|
43 |
+
at::Tensor weight_tensor, at::Tensor grad_features_tensor);
|
44 |
+
|
45 |
+
|
46 |
+
void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
|
47 |
+
const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
|
48 |
+
float *dist2, int *idx);
|
49 |
+
|
50 |
+
|
51 |
+
void three_interpolate_kernel_launcher_stack(int N, int channels,
|
52 |
+
const float *features, const int *idx, const float *weight, float *out);
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
|
57 |
+
const int *idx, const float *weight, float *grad_features);
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
#endif
|
pc_util/src/pointnet2_api.cpp
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/serialize/tensor.h>
|
2 |
+
#include <torch/extension.h>
|
3 |
+
|
4 |
+
#include "ball_query_gpu.h"
|
5 |
+
#include "group_points_gpu.h"
|
6 |
+
#include "sampling_gpu.h"
|
7 |
+
#include "interpolate_gpu.h"
|
8 |
+
#include "cluster_gpu.h"
|
9 |
+
|
10 |
+
|
11 |
+
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
12 |
+
m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");
|
13 |
+
m.def("ball_center_query_wrapper", &ball_center_query_wrapper_fast, "ball_center_query_wrapper_fast");
|
14 |
+
m.def("knn_query_wrapper", &knn_query_wrapper_fast, "knn_query_wrapper_fast");
|
15 |
+
|
16 |
+
m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
|
17 |
+
m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");
|
18 |
+
|
19 |
+
m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
|
20 |
+
m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");
|
21 |
+
|
22 |
+
m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
|
23 |
+
|
24 |
+
m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
|
25 |
+
m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
|
26 |
+
m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
|
27 |
+
|
28 |
+
m.def("dbscan_wrapper", &dbscan_wrapper_fast, "dbscan_wrapper_fast");
|
29 |
+
m.def("cluster_pts_wrapper", &cluster_pts_wrapper_fast, "cluster_pts_wrapper_fast");
|
30 |
+
|
31 |
+
|
32 |
+
m.def("ball_query_wrapper_stack", &ball_query_wrapper_stack, "ball_query_wrapper_stack");
|
33 |
+
|
34 |
+
m.def("group_points_wrapper_stack", &group_points_wrapper_stack, "group_points_wrapper_stack");
|
35 |
+
m.def("group_points_grad_wrapper_stack", &group_points_grad_wrapper_stack, "group_points_grad_wrapper_stack");
|
36 |
+
|
37 |
+
m.def("three_nn_wrapper_stack", &three_nn_wrapper_stack, "three_nn_wrapper_stack");
|
38 |
+
m.def("three_interpolate_wrapper_stack", &three_interpolate_wrapper_stack, "three_interpolate_wrapper_stack");
|
39 |
+
m.def("three_interpolate_grad_wrapper_stack", &three_interpolate_grad_wrapper_stack, "three_interpolate_grad_wrapper_stack");
|
40 |
+
|
41 |
+
}
|
pc_util/src/sampling.cpp
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <torch/serialize/tensor.h>
|
2 |
+
#include <ATen/cuda/CUDAContext.h>
|
3 |
+
#include <vector>
|
4 |
+
// #include <THC/THC.h>
|
5 |
+
|
6 |
+
#include "sampling_gpu.h"
|
7 |
+
|
8 |
+
// extern THCState *state;
|
9 |
+
|
10 |
+
#include <ATen/cuda/CUDAContext.h>
|
11 |
+
#include <ATen/cuda/CUDAEvent.h>
|
12 |
+
// cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
13 |
+
|
14 |
+
int gather_points_wrapper_fast(int b, int c, int n, int npoints,
|
15 |
+
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){
|
16 |
+
const float *points = points_tensor.data<float>();
|
17 |
+
const int *idx = idx_tensor.data<int>();
|
18 |
+
float *out = out_tensor.data<float>();
|
19 |
+
|
20 |
+
gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out);
|
21 |
+
return 1;
|
22 |
+
}
|
23 |
+
|
24 |
+
|
25 |
+
int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
|
26 |
+
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
|
27 |
+
|
28 |
+
const float *grad_out = grad_out_tensor.data<float>();
|
29 |
+
const int *idx = idx_tensor.data<int>();
|
30 |
+
float *grad_points = grad_points_tensor.data<float>();
|
31 |
+
|
32 |
+
gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points);
|
33 |
+
return 1;
|
34 |
+
}
|
35 |
+
|
36 |
+
|
37 |
+
int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
|
38 |
+
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
|
39 |
+
|
40 |
+
const float *points = points_tensor.data<float>();
|
41 |
+
float *temp = temp_tensor.data<float>();
|
42 |
+
int *idx = idx_tensor.data<int>();
|
43 |
+
|
44 |
+
furthest_point_sampling_kernel_launcher(b, c, n, m, w1, w2, points, temp, idx);
|
45 |
+
return 1;
|
46 |
+
}
|
pc_util/src/sampling_gpu.cu
ADDED
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#include <stdio.h>
|
2 |
+
#include <stdlib.h>
|
3 |
+
|
4 |
+
#include "cuda_utils.h"
|
5 |
+
#include "sampling_gpu.h"
|
6 |
+
|
7 |
+
|
8 |
+
__global__ void gather_points_kernel_fast(int b, int c, int n, int m,
|
9 |
+
const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
|
10 |
+
// points: (B, C, N)
|
11 |
+
// idx: (B, M)
|
12 |
+
// output:
|
13 |
+
// out: (B, C, M)
|
14 |
+
|
15 |
+
int bs_idx = blockIdx.z;
|
16 |
+
int c_idx = blockIdx.y;
|
17 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
18 |
+
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
|
19 |
+
|
20 |
+
out += bs_idx * c * m + c_idx * m + pt_idx;
|
21 |
+
idx += bs_idx * m + pt_idx;
|
22 |
+
points += bs_idx * c * n + c_idx * n;
|
23 |
+
out[0] = points[idx[0]];
|
24 |
+
}
|
25 |
+
|
26 |
+
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
|
27 |
+
const float *points, const int *idx, float *out) {
|
28 |
+
// points: (B, C, N)
|
29 |
+
// idx: (B, npoints)
|
30 |
+
// output:
|
31 |
+
// out: (B, C, npoints)
|
32 |
+
|
33 |
+
cudaError_t err;
|
34 |
+
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
35 |
+
dim3 threads(THREADS_PER_BLOCK);
|
36 |
+
|
37 |
+
gather_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, points, idx, out);
|
38 |
+
|
39 |
+
err = cudaGetLastError();
|
40 |
+
if (cudaSuccess != err) {
|
41 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
42 |
+
exit(-1);
|
43 |
+
}
|
44 |
+
}
|
45 |
+
|
46 |
+
__global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
|
47 |
+
const int *__restrict__ idx, float *__restrict__ grad_points) {
|
48 |
+
// grad_out: (B, C, M)
|
49 |
+
// idx: (B, M)
|
50 |
+
// output:
|
51 |
+
// grad_points: (B, C, N)
|
52 |
+
|
53 |
+
int bs_idx = blockIdx.z;
|
54 |
+
int c_idx = blockIdx.y;
|
55 |
+
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
56 |
+
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
|
57 |
+
|
58 |
+
grad_out += bs_idx * c * m + c_idx * m + pt_idx;
|
59 |
+
idx += bs_idx * m + pt_idx;
|
60 |
+
grad_points += bs_idx * c * n + c_idx * n;
|
61 |
+
|
62 |
+
atomicAdd(grad_points + idx[0], grad_out[0]);
|
63 |
+
}
|
64 |
+
|
65 |
+
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
|
66 |
+
const float *grad_out, const int *idx, float *grad_points) {
|
67 |
+
// grad_out: (B, C, npoints)
|
68 |
+
// idx: (B, npoints)
|
69 |
+
// output:
|
70 |
+
// grad_points: (B, C, N)
|
71 |
+
|
72 |
+
cudaError_t err;
|
73 |
+
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
|
74 |
+
dim3 threads(THREADS_PER_BLOCK);
|
75 |
+
|
76 |
+
gather_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, grad_out, idx, grad_points);
|
77 |
+
|
78 |
+
err = cudaGetLastError();
|
79 |
+
if (cudaSuccess != err) {
|
80 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
81 |
+
exit(-1);
|
82 |
+
}
|
83 |
+
}
|
84 |
+
|
85 |
+
|
86 |
+
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
|
87 |
+
const float v1 = dists[idx1], v2 = dists[idx2];
|
88 |
+
const int i1 = dists_i[idx1], i2 = dists_i[idx2];
|
89 |
+
dists[idx1] = max(v1, v2);
|
90 |
+
dists_i[idx1] = v2 > v1 ? i2 : i1;
|
91 |
+
}
|
92 |
+
|
93 |
+
template <unsigned int block_size>
|
94 |
+
__global__ void furthest_point_sampling_kernel(int b, int c, int n, int m, float w1, float w2,
|
95 |
+
const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
|
96 |
+
// dataset: (B, N, 3)
|
97 |
+
// tmp: (B, N)
|
98 |
+
// output:
|
99 |
+
// idx: (B, M)
|
100 |
+
|
101 |
+
if (m <= 0) return;
|
102 |
+
__shared__ float dists[block_size];
|
103 |
+
__shared__ int dists_i[block_size];
|
104 |
+
|
105 |
+
int batch_index = blockIdx.x;
|
106 |
+
dataset += batch_index * n * c;
|
107 |
+
temp += batch_index * n;
|
108 |
+
idxs += batch_index * m;
|
109 |
+
|
110 |
+
int tid = threadIdx.x;
|
111 |
+
const int stride = block_size;
|
112 |
+
|
113 |
+
int old = 0;
|
114 |
+
if (threadIdx.x == 0)
|
115 |
+
idxs[0] = old;
|
116 |
+
|
117 |
+
__syncthreads();
|
118 |
+
for (int j = 1; j < m; j++) {
|
119 |
+
int besti = 0;
|
120 |
+
float best = -1;
|
121 |
+
float x1 = dataset[old * c + 0];
|
122 |
+
float y1 = dataset[old * c + 1];
|
123 |
+
float z1 = dataset[old * c + 2];
|
124 |
+
|
125 |
+
for (int k = tid; k < n; k += stride) {
|
126 |
+
float x2, y2, z2;
|
127 |
+
x2 = dataset[k * c + 0];
|
128 |
+
y2 = dataset[k * c + 1];
|
129 |
+
z2 = dataset[k * c + 2];
|
130 |
+
// float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
|
131 |
+
// if (mag <= 1e-3)
|
132 |
+
// continue;
|
133 |
+
|
134 |
+
float xyz_d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
|
135 |
+
float fea_d = 0;
|
136 |
+
for (int l = 3; l < c; l++) {
|
137 |
+
fea_d += (dataset[old * c + l] - dataset[k * c + l]) * (dataset[old * c + l] - dataset[k * c + l]);
|
138 |
+
}
|
139 |
+
float d = w1 * xyz_d + w2 * fea_d;
|
140 |
+
float d2 = min(d, temp[k]);
|
141 |
+
temp[k] = d2;
|
142 |
+
besti = d2 > best ? k : besti;
|
143 |
+
best = d2 > best ? d2 : best;
|
144 |
+
}
|
145 |
+
dists[tid] = best;
|
146 |
+
dists_i[tid] = besti;
|
147 |
+
__syncthreads();
|
148 |
+
|
149 |
+
if (block_size >= 1024) {
|
150 |
+
if (tid < 512) {
|
151 |
+
__update(dists, dists_i, tid, tid + 512);
|
152 |
+
}
|
153 |
+
__syncthreads();
|
154 |
+
}
|
155 |
+
|
156 |
+
if (block_size >= 512) {
|
157 |
+
if (tid < 256) {
|
158 |
+
__update(dists, dists_i, tid, tid + 256);
|
159 |
+
}
|
160 |
+
__syncthreads();
|
161 |
+
}
|
162 |
+
if (block_size >= 256) {
|
163 |
+
if (tid < 128) {
|
164 |
+
__update(dists, dists_i, tid, tid + 128);
|
165 |
+
}
|
166 |
+
__syncthreads();
|
167 |
+
}
|
168 |
+
if (block_size >= 128) {
|
169 |
+
if (tid < 64) {
|
170 |
+
__update(dists, dists_i, tid, tid + 64);
|
171 |
+
}
|
172 |
+
__syncthreads();
|
173 |
+
}
|
174 |
+
if (block_size >= 64) {
|
175 |
+
if (tid < 32) {
|
176 |
+
__update(dists, dists_i, tid, tid + 32);
|
177 |
+
}
|
178 |
+
__syncthreads();
|
179 |
+
}
|
180 |
+
if (block_size >= 32) {
|
181 |
+
if (tid < 16) {
|
182 |
+
__update(dists, dists_i, tid, tid + 16);
|
183 |
+
}
|
184 |
+
__syncthreads();
|
185 |
+
}
|
186 |
+
if (block_size >= 16) {
|
187 |
+
if (tid < 8) {
|
188 |
+
__update(dists, dists_i, tid, tid + 8);
|
189 |
+
}
|
190 |
+
__syncthreads();
|
191 |
+
}
|
192 |
+
if (block_size >= 8) {
|
193 |
+
if (tid < 4) {
|
194 |
+
__update(dists, dists_i, tid, tid + 4);
|
195 |
+
}
|
196 |
+
__syncthreads();
|
197 |
+
}
|
198 |
+
if (block_size >= 4) {
|
199 |
+
if (tid < 2) {
|
200 |
+
__update(dists, dists_i, tid, tid + 2);
|
201 |
+
}
|
202 |
+
__syncthreads();
|
203 |
+
}
|
204 |
+
if (block_size >= 2) {
|
205 |
+
if (tid < 1) {
|
206 |
+
__update(dists, dists_i, tid, tid + 1);
|
207 |
+
}
|
208 |
+
__syncthreads();
|
209 |
+
}
|
210 |
+
|
211 |
+
old = dists_i[0];
|
212 |
+
if (tid == 0)
|
213 |
+
idxs[j] = old;
|
214 |
+
}
|
215 |
+
}
|
216 |
+
|
217 |
+
void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
|
218 |
+
const float *dataset, float *temp, int *idxs) {
|
219 |
+
// dataset: (B, N, 3)
|
220 |
+
// tmp: (B, N)
|
221 |
+
// output:
|
222 |
+
// idx: (B, M)
|
223 |
+
|
224 |
+
cudaError_t err;
|
225 |
+
unsigned int n_threads = opt_n_threads(n);
|
226 |
+
|
227 |
+
switch (n_threads) {
|
228 |
+
case 1024:
|
229 |
+
furthest_point_sampling_kernel<1024><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
230 |
+
case 512:
|
231 |
+
furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
232 |
+
case 256:
|
233 |
+
furthest_point_sampling_kernel<256><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
234 |
+
case 128:
|
235 |
+
furthest_point_sampling_kernel<128><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
236 |
+
case 64:
|
237 |
+
furthest_point_sampling_kernel<64><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
238 |
+
case 32:
|
239 |
+
furthest_point_sampling_kernel<32><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
240 |
+
case 16:
|
241 |
+
furthest_point_sampling_kernel<16><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
242 |
+
case 8:
|
243 |
+
furthest_point_sampling_kernel<8><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
244 |
+
case 4:
|
245 |
+
furthest_point_sampling_kernel<4><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
246 |
+
case 2:
|
247 |
+
furthest_point_sampling_kernel<2><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
248 |
+
case 1:
|
249 |
+
furthest_point_sampling_kernel<1><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
|
250 |
+
default:
|
251 |
+
furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs);
|
252 |
+
}
|
253 |
+
|
254 |
+
err = cudaGetLastError();
|
255 |
+
if (cudaSuccess != err) {
|
256 |
+
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
|
257 |
+
exit(-1);
|
258 |
+
}
|
259 |
+
}
|
pc_util/src/sampling_gpu.h
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ifndef _SAMPLING_GPU_H
|
2 |
+
#define _SAMPLING_GPU_H
|
3 |
+
|
4 |
+
#include <torch/serialize/tensor.h>
|
5 |
+
#include <ATen/cuda/CUDAContext.h>
|
6 |
+
#include<vector>
|
7 |
+
|
8 |
+
|
9 |
+
int gather_points_wrapper_fast(int b, int c, int n, int npoints,
|
10 |
+
at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
|
11 |
+
|
12 |
+
void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
|
13 |
+
const float *points, const int *idx, float *out);
|
14 |
+
|
15 |
+
|
16 |
+
int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
|
17 |
+
at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
|
18 |
+
|
19 |
+
void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
|
20 |
+
const float *grad_out, const int *idx, float *grad_points);
|
21 |
+
|
22 |
+
|
23 |
+
int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
|
24 |
+
at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
|
25 |
+
|
26 |
+
void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
|
27 |
+
const float *dataset, float *temp, int *idxs);
|
28 |
+
|
29 |
+
#endif
|
script.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### This is example of the script that will be run in the test environment.
|
2 |
+
### Some parts of the code are compulsory and you should NOT CHANGE THEM.
|
3 |
+
### They are between '''---compulsory---''' comments.
|
4 |
+
### You can change the rest of the code to define and test your solution.
|
5 |
+
### However, you should not change the signature of the provided function.
|
6 |
+
### The script would save "submission.parquet" file in the current directory.
|
7 |
+
### The actual logic of the solution is implemented in the `handcrafted_solution.py` file.
|
8 |
+
### The `handcrafted_solution.py` file is a placeholder for your solution.
|
9 |
+
### You should implement the logic of your solution in that file.
|
10 |
+
### You can use any additional files and subdirectories to organize your code.
|
11 |
+
|
12 |
+
'''---compulsory---'''
|
13 |
+
# import subprocess
|
14 |
+
# from pathlib import Path
|
15 |
+
# def install_package_from_local_file(package_name, folder='packages'):
|
16 |
+
# """
|
17 |
+
# Installs a package from a local .whl file or a directory containing .whl files using pip.
|
18 |
+
|
19 |
+
# Parameters:
|
20 |
+
# path_to_file_or_directory (str): The path to the .whl file or the directory containing .whl files.
|
21 |
+
# """
|
22 |
+
# try:
|
23 |
+
# pth = str(Path(folder) / package_name)
|
24 |
+
# subprocess.check_call([subprocess.sys.executable, "-m", "pip", "install",
|
25 |
+
# "--no-index", # Do not use package index
|
26 |
+
# "--find-links", pth, # Look for packages in the specified directory or at the file
|
27 |
+
# package_name]) # Specify the package to install
|
28 |
+
# print(f"Package installed successfully from {pth}")
|
29 |
+
# except subprocess.CalledProcessError as e:
|
30 |
+
# print(f"Failed to install package from {pth}. Error: {e}")
|
31 |
+
|
32 |
+
# install_package_from_local_file('hoho')
|
33 |
+
|
34 |
+
import hoho; hoho.setup() # YOU MUST CALL hoho.setup() BEFORE ANYTHING ELSE
|
35 |
+
# import subprocess
|
36 |
+
# import importlib
|
37 |
+
# from pathlib import Path
|
38 |
+
# import subprocess
|
39 |
+
|
40 |
+
|
41 |
+
# ### The function below is useful for installing additional python wheels.
|
42 |
+
# def install_package_from_local_file(package_name, folder='packages'):
|
43 |
+
# """
|
44 |
+
# Installs a package from a local .whl file or a directory containing .whl files using pip.
|
45 |
+
|
46 |
+
# Parameters:
|
47 |
+
# path_to_file_or_directory (str): The path to the .whl file or the directory containing .whl files.
|
48 |
+
# """
|
49 |
+
# try:
|
50 |
+
# pth = str(Path(folder) / package_name)
|
51 |
+
# subprocess.check_call([subprocess.sys.executable, "-m", "pip", "install",
|
52 |
+
# "--no-index", # Do not use package index
|
53 |
+
# "--find-links", pth, # Look for packages in the specified directory or at the file
|
54 |
+
# package_name]) # Specify the package to install
|
55 |
+
# print(f"Package installed successfully from {pth}")
|
56 |
+
# except subprocess.CalledProcessError as e:
|
57 |
+
# print(f"Failed to install package from {pth}. Error: {e}")
|
58 |
+
|
59 |
+
|
60 |
+
# pip download webdataset -d packages/webdataset --platform manylinux1_x86_64 --python-version 38 --only-binary=:all:
|
61 |
+
# install_package_from_local_file('webdataset')
|
62 |
+
# install_package_from_local_file('tqdm')
|
63 |
+
|
64 |
+
### Here you can import any library or module you want.
|
65 |
+
### The code below is used to read and parse the input dataset.
|
66 |
+
### Please, do not modify it.
|
67 |
+
|
68 |
+
import webdataset as wds
|
69 |
+
from tqdm import tqdm
|
70 |
+
from typing import Dict
|
71 |
+
import pandas as pd
|
72 |
+
from transformers import AutoTokenizer
|
73 |
+
import os
|
74 |
+
import time
|
75 |
+
import io
|
76 |
+
from PIL import Image as PImage
|
77 |
+
import numpy as np
|
78 |
+
|
79 |
+
from hoho.read_write_colmap import read_cameras_binary, read_images_binary, read_points3D_binary
|
80 |
+
from hoho import proc, Sample
|
81 |
+
|
82 |
+
def convert_entry_to_human_readable(entry):
|
83 |
+
out = {}
|
84 |
+
already_good = ['__key__', 'wf_vertices', 'wf_edges', 'edge_semantics', 'mesh_vertices', 'mesh_faces', 'face_semantics', 'K', 'R', 't']
|
85 |
+
for k, v in entry.items():
|
86 |
+
if k in already_good:
|
87 |
+
out[k] = v
|
88 |
+
continue
|
89 |
+
if k == 'points3d':
|
90 |
+
out[k] = read_points3D_binary(fid=io.BytesIO(v))
|
91 |
+
if k == 'cameras':
|
92 |
+
out[k] = read_cameras_binary(fid=io.BytesIO(v))
|
93 |
+
if k == 'images':
|
94 |
+
out[k] = read_images_binary(fid=io.BytesIO(v))
|
95 |
+
if k in ['ade20k', 'gestalt']:
|
96 |
+
out[k] = [PImage.open(io.BytesIO(x)).convert('RGB') for x in v]
|
97 |
+
if k == 'depthcm':
|
98 |
+
out[k] = [PImage.open(io.BytesIO(x)) for x in entry['depthcm']]
|
99 |
+
return out
|
100 |
+
|
101 |
+
'''---end of compulsory---'''
|
102 |
+
|
103 |
+
### The part below is used to define and test your solution.
|
104 |
+
import subprocess
|
105 |
+
import sys
|
106 |
+
import os
|
107 |
+
|
108 |
+
import numpy as np
|
109 |
+
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
110 |
+
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
|
111 |
+
|
112 |
+
def uninstall_package(package_name):
|
113 |
+
"""
|
114 |
+
Uninstalls a package using pip.
|
115 |
+
|
116 |
+
Parameters:
|
117 |
+
package_name (str): The name of the package to uninstall.
|
118 |
+
"""
|
119 |
+
try:
|
120 |
+
subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", package_name])
|
121 |
+
print(f"Package {package_name} uninstalled successfully")
|
122 |
+
except subprocess.CalledProcessError as e:
|
123 |
+
print(f"Failed to uninstall package {package_name}. Error: {e}")
|
124 |
+
|
125 |
+
# def download_packages(packages, folder='packages/torch'):
|
126 |
+
# """
|
127 |
+
# Downloads packages as .whl files into the specified folder using pip.
|
128 |
+
|
129 |
+
# Parameters:
|
130 |
+
# packages (list): List of packages to download with versions.
|
131 |
+
# folder (str): The folder where the .whl files will be saved.
|
132 |
+
# """
|
133 |
+
# Path(folder).mkdir(parents=True, exist_ok=True)
|
134 |
+
# try:
|
135 |
+
# subprocess.check_call([sys.executable, "-m", "pip", "download",
|
136 |
+
# "--platform", "manylinux1_x86_64",
|
137 |
+
# "--python-version", "38",
|
138 |
+
# "--only-binary=:all:",
|
139 |
+
# "-d", folder] + packages)
|
140 |
+
# print(f"Packages downloaded successfully into {folder}")
|
141 |
+
# except subprocess.CalledProcessError as e:
|
142 |
+
# print(f"Failed to download packages. Error: {e}")
|
143 |
+
|
144 |
+
def download_packages(packages, folder):
|
145 |
+
# Create the directory if it doesn't exist
|
146 |
+
if not os.path.exists(folder):
|
147 |
+
os.makedirs(folder)
|
148 |
+
|
149 |
+
try:
|
150 |
+
subprocess.check_call([
|
151 |
+
'pip', 'download',
|
152 |
+
'--dest', folder,
|
153 |
+
'-f', 'https://download.pytorch.org/whl/cu121'
|
154 |
+
] + packages)
|
155 |
+
print(f"Packages downloaded successfully to {folder}")
|
156 |
+
except subprocess.CalledProcessError as e:
|
157 |
+
print(f"Failed to download packages. Error: {e}")
|
158 |
+
|
159 |
+
# Set CUDA environment variables
|
160 |
+
os.environ['CUDA_HOME'] = '/usr/local/cuda-12.1'
|
161 |
+
os.environ['PATH'] = os.environ['CUDA_HOME'] + '/bin:' + os.environ['PATH']
|
162 |
+
os.environ['LD_LIBRARY_PATH'] = os.environ['CUDA_HOME'] + '/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')
|
163 |
+
|
164 |
+
def install_package_from_local_file(package_name, folder='packages'):
|
165 |
+
"""
|
166 |
+
Installs a package from a local .whl file or a directory containing .whl files using pip.
|
167 |
+
|
168 |
+
Parameters:
|
169 |
+
package_name (str): The name of the package to install.
|
170 |
+
folder (str): The folder where the .whl files are located.
|
171 |
+
"""
|
172 |
+
try:
|
173 |
+
pth = str(Path(folder) / package_name)
|
174 |
+
subprocess.check_call([sys.executable, "-m", "pip", "install",
|
175 |
+
"--no-index", # Do not use package index
|
176 |
+
"--find-links", pth, # Look for packages in the specified directory or at the file
|
177 |
+
package_name]) # Specify the package to install
|
178 |
+
print(f"Package installed successfully from {pth}")
|
179 |
+
except subprocess.CalledProcessError as e:
|
180 |
+
print(f"Failed to install package from {pth}. Error: {e}")
|
181 |
+
|
182 |
+
def install_which():
|
183 |
+
try:
|
184 |
+
# Attempt to install which if it's not available
|
185 |
+
subprocess.check_call(['sudo', 'apt-get', 'install', '-y', 'which'])
|
186 |
+
print("Which installed successfully.")
|
187 |
+
except subprocess.CalledProcessError as e:
|
188 |
+
print(f"An error occurred while installing which: {e}")
|
189 |
+
sys.exit(1)
|
190 |
+
|
191 |
+
def setup_environment():
|
192 |
+
# Uninstall torch if it is already installed
|
193 |
+
# packages_to_uninstall = ['torch', 'torchvision', 'torchaudio']
|
194 |
+
# for package in packages_to_uninstall:
|
195 |
+
# uninstall_package(package)
|
196 |
+
# Download required packages
|
197 |
+
# pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
|
198 |
+
# pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121
|
199 |
+
# pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
|
200 |
+
# packages_to_download = ['torch==1.13.1', 'torchvision==0.14.1', 'torchaudio==0.13.1']
|
201 |
+
# packages_to_download = ['torch==2.1.0', 'torchvision==0.16.0', 'torchaudio==2.1.0']
|
202 |
+
# download_packages(packages_to_download, folder='packages/torch')
|
203 |
+
|
204 |
+
# Install ninja
|
205 |
+
# install_package_from_local_file('ninja', folder='packages')
|
206 |
+
|
207 |
+
# packages_to_download = ['torch==2.1.0', 'torchvision==0.16.0', 'torchaudio==2.1.0']
|
208 |
+
# download_folder = 'packages/torch'
|
209 |
+
|
210 |
+
# Download the packages
|
211 |
+
# download_packages(packages_to_download, download_folder)
|
212 |
+
|
213 |
+
# Install packages from local files
|
214 |
+
# install_package_from_local_file('torch', folder='packages')
|
215 |
+
# install_package_from_local_file('packages/torch/torchvision-0.16.0-cp38-cp38-manylinux1_x86_64.whl', folder='packages/torch')
|
216 |
+
# install_package_from_local_file('packages/torch/torchaudio-2.1.0-cp38-cp38-manylinux1_x86_64.whl', folder='packages/torch')
|
217 |
+
# install_package_from_local_file('scikit-learn', folder='packages')
|
218 |
+
# install_package_from_local_file('open3d', folder='packages')
|
219 |
+
# install_package_from_local_file('easydict', folder='packages')
|
220 |
+
# install_package_from_local_file('setuptools', folder='packages')
|
221 |
+
# install_package_from_local_file('ninja', folder='packages')
|
222 |
+
# download_packages(['scikit-learn'], folder='packages/scikit-learn')
|
223 |
+
# download_packages(['open3d'], folder='packages/open3d')
|
224 |
+
# download_packages(['easydict'], folder='packages/easydict')
|
225 |
+
|
226 |
+
# try:
|
227 |
+
# subprocess.check_call(['which', 'which'])
|
228 |
+
# except subprocess.CalledProcessError:
|
229 |
+
# install_which()
|
230 |
+
|
231 |
+
pc_util_path = os.path.join(os.getcwd(), 'pc_util')
|
232 |
+
if os.path.isdir(pc_util_path):
|
233 |
+
os.chdir(pc_util_path)
|
234 |
+
subprocess.check_call([sys.executable, "setup.py", "install"], cwd=pc_util_path)
|
235 |
+
os.chdir("..")
|
236 |
+
|
237 |
+
def setup_cuda_environment():
|
238 |
+
# cuda_home = '/usr/local/cuda'
|
239 |
+
# if not os.path.exists(cuda_home):
|
240 |
+
# raise EnvironmentError(f"CUDA_HOME directory {cuda_home} does not exist. Please install CUDA and set CUDA_HOME environment variable.")
|
241 |
+
# os.environ['CUDA_HOME'] = cuda_home
|
242 |
+
# os.environ['PATH'] = f"{cuda_home}/bin:{os.environ['PATH']}"
|
243 |
+
# os.environ['LD_LIBRARY_PATH'] = f"{cuda_home}/lib64:{os.environ.get('LD_LIBRARY_PATH', '')}"
|
244 |
+
|
245 |
+
os.environ['PATH'] = '/usr/local/cuda/bin'
|
246 |
+
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda/lib64'
|
247 |
+
os.environ['LIBRARY_PATH'] = '/usr/local/cuda/lib64'
|
248 |
+
|
249 |
+
# usr_local_contents = os.listdir('/usr/local')
|
250 |
+
# # print("Items under /usr/local:")
|
251 |
+
# for item in usr_local_contents:
|
252 |
+
# print(item)
|
253 |
+
|
254 |
+
from pathlib import Path
|
255 |
+
def save_submission(submission, path):
|
256 |
+
"""
|
257 |
+
Saves the submission to a specified path.
|
258 |
+
|
259 |
+
Parameters:
|
260 |
+
submission (List[Dict[]]): The submission to save.
|
261 |
+
path (str): The path to save the submission to.
|
262 |
+
"""
|
263 |
+
sub = pd.DataFrame(submission, columns=["__key__", "wf_vertices", "wf_edges"])
|
264 |
+
sub.to_parquet(path)
|
265 |
+
print(f"Submission saved to {path}")
|
266 |
+
|
267 |
+
if __name__ == "__main__":
|
268 |
+
# setup_cuda_environment()
|
269 |
+
setup_environment()
|
270 |
+
|
271 |
+
from handcrafted_solution import predict
|
272 |
+
print ("------------ Loading dataset------------ ")
|
273 |
+
params = hoho.get_params()
|
274 |
+
dataset = hoho.get_dataset(decode=None, split='all', dataset_type='webdataset')
|
275 |
+
|
276 |
+
print('------------ Now you can do your solution ---------------')
|
277 |
+
solution = []
|
278 |
+
from concurrent.futures import ProcessPoolExecutor
|
279 |
+
with ProcessPoolExecutor(max_workers=8) as pool:
|
280 |
+
results = []
|
281 |
+
for i, sample in enumerate(tqdm(dataset)):
|
282 |
+
results.append(pool.submit(predict, sample, visualize=False))
|
283 |
+
|
284 |
+
for i, result in enumerate(tqdm(results)):
|
285 |
+
key, pred_vertices, pred_edges = result.result()
|
286 |
+
solution.append({
|
287 |
+
'__key__': key,
|
288 |
+
'wf_vertices': pred_vertices.tolist(),
|
289 |
+
'wf_edges': pred_edges
|
290 |
+
})
|
291 |
+
if i % 100 == 0:
|
292 |
+
# incrementally save the results in case we run out of time
|
293 |
+
print(f"Processed {i} samples")
|
294 |
+
# save_submission(solution, Path(params['output_path']) / "submission.parquet")
|
295 |
+
print('------------ Saving results ---------------')
|
296 |
+
save_submission(solution, Path(params['output_path']) / "submission.parquet")
|
297 |
+
print("------------ Done ------------ ")
|