colin1842 committed
Commit 1c849a8 · verified · 1 Parent(s): c482d53

Upload 23 files

.gitattributes CHANGED
@@ -56,3 +56,4 @@ packages/open3d/widgetsnbextension-4.0.11-py3-none-any.whl filter=lfs diff=lfs m
  packages/scikit-learn/numpy-1.21.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl filter=lfs diff=lfs merge=lfs -text
  packages/scikit-learn/scikit_learn-1.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl filter=lfs diff=lfs merge=lfs -text
  packages/scikit-learn/scipy-1.7.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+ pc_util/dist/pc_util-1.0-py3.8-linux-x86_64.egg filter=lfs diff=lfs merge=lfs -text
pc_util/dist/pc_util-1.0-py3.8-linux-x86_64.egg ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:094a37bbfdc368a602ba81c1407b7358178f2ca6253e471ffce0b0ecee469224
+ size 4613678
pc_util/pc_util.egg-info/PKG-INFO ADDED
@@ -0,0 +1,3 @@
+ Metadata-Version: 2.1
+ Name: pc_util
+ Version: 1.0
pc_util/pc_util.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,16 @@
+ setup.py
+ pc_util.egg-info/PKG-INFO
+ pc_util.egg-info/SOURCES.txt
+ pc_util.egg-info/dependency_links.txt
+ pc_util.egg-info/top_level.txt
+ src/ball_query.cpp
+ src/ball_query_gpu.cu
+ src/cluster.cpp
+ src/cluster_gpu.cu
+ src/group_points.cpp
+ src/group_points_gpu.cu
+ src/interpolate.cpp
+ src/interpolate_gpu.cu
+ src/pointnet2_api.cpp
+ src/sampling.cpp
+ src/sampling_gpu.cu
pc_util/pc_util.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
pc_util/pc_util.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ pc_util
pc_util/setup.py ADDED
@@ -0,0 +1,23 @@
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+ setup(
+     name='pc_util',
+     version='1.0',
+     ext_modules=[
+         CUDAExtension('pc_util', [
+             'src/pointnet2_api.cpp',
+             'src/ball_query.cpp',
+             'src/ball_query_gpu.cu',
+             'src/group_points.cpp',
+             'src/group_points_gpu.cu',
+             'src/interpolate.cpp',
+             'src/interpolate_gpu.cu',
+             'src/sampling.cpp',
+             'src/sampling_gpu.cu',
+             'src/cluster.cpp',
+             'src/cluster_gpu.cu',
+         ], extra_compile_args={'cxx': ['-g'], 'nvcc': ['-O2']})
+     ],
+     cmdclass={'build_ext': BuildExtension}
+ )
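Note: the file above is a standard torch.utils.cpp_extension build script, so the extension is typically compiled before anything in pc_util can be used. A minimal build-and-inspect sketch (illustrative only, not part of the committed files), assuming a CUDA-enabled PyTorch install and nvcc on PATH; the Python-visible wrapper names are registered in src/pointnet2_api.cpp, which this diff does not show, so none are assumed here:

    # From the pc_util/ directory, build and install the CUDAExtension defined above:
    #   python setup.py install
    import torch     # needs a CUDA build of PyTorch so the .cu sources can be compiled
    import pc_util   # module name given to CUDAExtension('pc_util', ...) in setup.py

    # The callable wrappers (ball query, grouping, interpolation, clustering, sampling)
    # are bound in src/pointnet2_api.cpp, which is not part of this diff, so list them
    # instead of guessing their names:
    print([name for name in dir(pc_util) if not name.startswith('_')])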
pc_util/src/ball_query.cpp ADDED
@@ -0,0 +1,84 @@
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ // #include <THC/THC.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include "ball_query_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ #define CHECK_CUDA(x) do { \
+     if (!x.type().is_cuda()) { \
+         fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_CONTIGUOUS(x) do { \
+     if (!x.is_contiguous()) { \
+         fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(new_xyz_tensor);
+     CHECK_INPUT(xyz_tensor);
+     const float *new_xyz = new_xyz_tensor.data<float>();
+     const float *xyz = xyz_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     ball_query_kernel_launcher_fast(b, n, m, radius, nsample, new_xyz, xyz, idx);
+     return 1;
+ }
+
+
+ int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
+     at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(point_tensor);
+     CHECK_INPUT(key_point_tensor);
+     const float *point = point_tensor.data<float>();
+     const float *key_point = key_point_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     ball_center_query_kernel_launcher_fast(b, n, m, radius, point, key_point, idx);
+     return 1;
+ }
+
+
+ int knn_query_wrapper_fast(int b, int n, int m, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(new_xyz_tensor);
+     CHECK_INPUT(xyz_tensor);
+     const float *new_xyz = new_xyz_tensor.data<float>();
+     const float *xyz = xyz_tensor.data<float>();
+     float *dist2 = dist2_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     knn_query_kernel_launcher_fast(b, n, m, nsample, new_xyz, xyz, dist2, idx);
+     return 1;
+ }
+
+
+ int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
+     at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(new_xyz_tensor);
+     CHECK_INPUT(xyz_tensor);
+     CHECK_INPUT(new_xyz_batch_cnt_tensor);
+     CHECK_INPUT(xyz_batch_cnt_tensor);
+
+     const float *new_xyz = new_xyz_tensor.data<float>();
+     const float *xyz = xyz_tensor.data<float>();
+     const int *new_xyz_batch_cnt = new_xyz_batch_cnt_tensor.data<int>();
+     const int *xyz_batch_cnt = xyz_batch_cnt_tensor.data<int>();
+     int *idx = idx_tensor.data<int>();
+
+     ball_query_kernel_launcher_stack(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
+     return 1;
+ }
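Note: a hedged sketch (illustrative only, not part of the committed files) of the tensor shapes and dtypes the fast ball-query wrapper above expects, taken from the shape comments in src/ball_query_gpu.cu; the Python-level name in the commented-out call is an assumption, since the actual binding is defined in src/pointnet2_api.cpp:

    import torch

    # Shapes from the kernel comments: new_xyz (B, M, 3), xyz (B, N, 3), idx (B, M, nsample).
    B, N, M, nsample, radius = 2, 1024, 128, 32, 0.4
    xyz = torch.rand(B, N, 3, device='cuda').contiguous()        # all points
    new_xyz = torch.rand(B, M, 3, device='cuda').contiguous()    # query ball centres
    idx = torch.zeros(B, M, nsample, dtype=torch.int32, device='cuda')  # output index buffer

    # Hypothetical call; the exported name comes from pointnet2_api.cpp (not in this diff):
    # pc_util.ball_query_wrapper_fast(B, N, M, radius, nsample, new_xyz, xyz, idx)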
pc_util/src/ball_query_gpu.cu ADDED
@@ -0,0 +1,270 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+
6
+ #include "ball_query_gpu.h"
7
+ #include "cuda_utils.h"
8
+
9
+
10
+ __global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample,
11
+ const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) {
12
+ // new_xyz: (B, M, 3)
13
+ // xyz: (B, N, 3)
14
+ // output:
15
+ // idx: (B, M, nsample)
16
+ int bs_idx = blockIdx.y;
17
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
18
+ if (bs_idx >= b || pt_idx >= m) return;
19
+
20
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
21
+ xyz += bs_idx * n * 3;
22
+ idx += bs_idx * m * nsample + pt_idx * nsample;
23
+
24
+ float radius2 = radius * radius;
25
+ float new_x = new_xyz[0];
26
+ float new_y = new_xyz[1];
27
+ float new_z = new_xyz[2];
28
+
29
+ int cnt = 0;
30
+ for (int k = 0; k < n; ++k) {
31
+ float x = xyz[k * 3 + 0];
32
+ float y = xyz[k * 3 + 1];
33
+ float z = xyz[k * 3 + 2];
34
+ float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
35
+ if (d2 < radius2){
36
+ if (cnt == 0){
37
+ for (int l = 0; l < nsample; ++l) {
38
+ idx[l] = k;
39
+ }
40
+ }
41
+ idx[cnt] = k;
42
+ ++cnt;
43
+ if (cnt >= nsample) break;
44
+ }
45
+ }
46
+ }
47
+
48
+
49
+ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \
50
+ const float *new_xyz, const float *xyz, int *idx) {
51
+ // new_xyz: (B, M, 3)
52
+ // xyz: (B, N, 3)
53
+ // output:
54
+ // idx: (B, M, nsample)
55
+
56
+ cudaError_t err;
57
+
58
+ dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
59
+ dim3 threads(THREADS_PER_BLOCK);
60
+
61
+ ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx);
62
+ // cudaDeviceSynchronize(); // for using printf in kernel function
63
+ err = cudaGetLastError();
64
+ if (cudaSuccess != err) {
65
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
66
+ exit(-1);
67
+ }
68
+ }
69
+
70
+
71
+ __global__ void ball_center_query_kernel_fast(int b, int n, int m, float radius, \
72
+ const float *__restrict__ point, const float *__restrict__ key_point, int *__restrict__ idx) {
73
+ // key_point: (B, M, 3)
74
+ // point: (B, N, 3)
75
+ // output:
76
+ // idx: (B, N)
77
+ int bs_idx = blockIdx.y;
78
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
79
+ if (bs_idx >= b || pt_idx >= n) return;
80
+
81
+ point += bs_idx * n * 3 + pt_idx * 3;
82
+ key_point += bs_idx * m * 3;
83
+ idx += bs_idx * n + pt_idx;
84
+
85
+ float radius2 = radius * radius;
86
+ float point_x = point[0];
87
+ float point_y = point[1];
88
+ float point_z = point[2];
89
+
90
+ float bestd = 1e8;
91
+ for (int k = 0; k < m; ++k) {
92
+ float x = key_point[k * 3 + 0];
93
+ float y = key_point[k * 3 + 1];
94
+ float z = key_point[k * 3 + 2];
95
+ if (((x + 1) * (x + 1) + (y + 1) * (y + 1) + (z + 1) * (z + 1)) < 1e-4) break;
96
+ float d2 = (point_x - x) * (point_x - x) + (point_y - y) * (point_y - y) + (point_z - z) * (point_z - z);
97
+ if (d2 < radius2 && d2 < bestd){
98
+ idx[0] = k;
99
+ bestd = d2;
100
+ }
101
+ }
102
+ }
103
+
104
+
105
+ void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius, \
106
+ const float *point, const float *key_point, int *idx) {
107
+ // point: (B, n, 3)
108
+ // key_point: (B, m, 3)
109
+ // output:
110
+ // idx: (B, n)
111
+
112
+ cudaError_t err;
113
+
114
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
115
+ dim3 threads(THREADS_PER_BLOCK);
116
+
117
+ ball_center_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, point, key_point, idx);
118
+ // cudaDeviceSynchronize(); // for using printf in kernel function
119
+ err = cudaGetLastError();
120
+ if (cudaSuccess != err) {
121
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
122
+ exit(-1);
123
+ }
124
+ }
125
+
126
+
127
+
128
+
129
+
130
+ __global__ void knn_query_kernel_fast(int b, int n, int m, int nsample, const float *__restrict__ new_xyz,
131
+ const float *__restrict__ xyz, float *__restrict__ dist2, int *__restrict__ idx) {
132
+
133
+ // new_xyz: (B, M, 3)
134
+ // xyz: (B, N, 3)
135
+ // output:
136
+ // dist2: (B, M, nsample)
137
+ // idx: (B, M, nsample)
138
+
139
+ int bs_idx = blockIdx.y;
140
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
141
+ if (bs_idx >= b || pt_idx >= m) return;
142
+
143
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
144
+ xyz += bs_idx * n * 3;
145
+ dist2 += bs_idx * m * nsample + pt_idx * nsample;
146
+ idx += bs_idx * m * nsample + pt_idx * nsample;
147
+
148
+ float nx = new_xyz[0];
149
+ float ny = new_xyz[1];
150
+ float nz = new_xyz[2];
151
+
152
+ for (int i = 0; i < n; ++i) {
153
+ float x = xyz[i * 3 + 0];
154
+ float y = xyz[i * 3 + 1];
155
+ float z = xyz[i * 3 + 2];
156
+ float d2 = (nx - x) * (nx - x) + (ny - y) * (ny - y) + (nz - z) * (nz - z);
157
+ if (d2 < dist2[nsample - 1]) {
158
+ dist2[nsample - 1] = d2;
159
+ idx[nsample - 1] = i;
160
+ for (int j = nsample - 2; j >= 0; j--) {
161
+ if (d2 < dist2[j]){
162
+ dist2[j + 1] = dist2[j];
163
+ dist2[j] = d2;
164
+ idx[j + 1] = idx[j];
165
+ idx[j] = i;
166
+ }
167
+ }
168
+ }
169
+ }
170
+ }
171
+
172
+
173
+ void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample, \
174
+ const float *new_xyz, const float *xyz, float *dist2, int *idx) {
175
+ cudaError_t err;
176
+
177
+ dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
178
+ dim3 threads(THREADS_PER_BLOCK);
179
+
180
+ knn_query_kernel_fast<<<blocks, threads>>>(b, n, m, nsample, new_xyz, xyz, dist2, idx);
181
+ // cudaDeviceSynchronize(); // for using printf in kernel function
182
+ err = cudaGetLastError();
183
+ if (cudaSuccess != err) {
184
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
185
+ exit(-1);
186
+ }
187
+ }
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+ __global__ void ball_query_kernel_stack(int B, int M, float radius, int nsample, \
197
+ const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) {
198
+ // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
199
+ // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
200
+ // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
201
+ // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
202
+ // output:
203
+ // idx: (M, nsample)
204
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
205
+ if (pt_idx >= M) return;
206
+
207
+ int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
208
+ for (int k = 1; k < B; k++){
209
+ if (pt_idx < pt_cnt) break;
210
+ pt_cnt += new_xyz_batch_cnt[k];
211
+ bs_idx = k;
212
+ }
213
+
214
+ int xyz_batch_start_idx = 0;
215
+ for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];
216
+ // for (int k = 0; k < bs_idx; k++) new_xyz_batch_start_idx += new_xyz_batch_cnt[k];
217
+
218
+ new_xyz += pt_idx * 3;
219
+ xyz += xyz_batch_start_idx * 3;
220
+ idx += pt_idx * nsample;
221
+
222
+ float radius2 = radius * radius;
223
+ float new_x = new_xyz[0];
224
+ float new_y = new_xyz[1];
225
+ float new_z = new_xyz[2];
226
+ int n = xyz_batch_cnt[bs_idx];
227
+
228
+ int cnt = 0;
229
+ for (int k = 0; k < n; ++k) {
230
+ float x = xyz[k * 3 + 0];
231
+ float y = xyz[k * 3 + 1];
232
+ float z = xyz[k * 3 + 2];
233
+ float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
234
+ if (d2 < radius2){
235
+ if (cnt == 0){
236
+ for (int l = 0; l < nsample; ++l) {
237
+ idx[l] = k;
238
+ }
239
+ }
240
+ idx[cnt] = k;
241
+ ++cnt;
242
+ if (cnt >= nsample) break;
243
+ }
244
+ }
245
+ if (cnt == 0) idx[0] = -1;
246
+ }
247
+
248
+
249
+ void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
250
+ const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx){
251
+ // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
252
+ // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
253
+ // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
254
+ // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
255
+ // output:
256
+ // idx: (M, nsample)
257
+
258
+ cudaError_t err;
259
+
260
+ dim3 blocks(DIVUP(M, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
261
+ dim3 threads(THREADS_PER_BLOCK);
262
+
263
+ ball_query_kernel_stack<<<blocks, threads>>>(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx);
264
+ // cudaDeviceSynchronize(); // for using printf in kernel function
265
+ err = cudaGetLastError();
266
+ if (cudaSuccess != err) {
267
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
268
+ exit(-1);
269
+ }
270
+ }
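Note: a short NumPy reference (illustrative sketch only, not part of the committed files) of what ball_query_kernel_fast above computes for one (batch, query-point) pair: it collects up to nsample point indices whose squared distance to the query centre is below radius², pre-filling the output row with the first hit so unused slots repeat that index:

    import numpy as np

    def ball_query_ref(center, xyz, radius, nsample):
        """center: (3,), xyz: (N, 3) -> idx: (nsample,) int32, indices of points within radius."""
        idx = np.zeros(nsample, dtype=np.int32)
        cnt = 0
        r2 = radius * radius
        for k, p in enumerate(xyz):
            d2 = float(np.sum((center - p) ** 2))
            if d2 < r2:
                if cnt == 0:
                    idx[:] = k          # pre-fill the row with the first hit, as the kernel does
                idx[cnt] = k
                cnt += 1
                if cnt >= nsample:
                    break
        return idx

    rng = np.random.default_rng(0)
    pts = rng.uniform(-1.0, 1.0, size=(8, 3)).astype(np.float32)
    print(ball_query_ref(np.zeros(3, dtype=np.float32), pts, radius=0.8, nsample=4))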
pc_util/src/ball_query_gpu.h ADDED
@@ -0,0 +1,38 @@
+ #ifndef _BALL_QUERY_GPU_H
+ #define _BALL_QUERY_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+
+ int ball_query_wrapper_fast(int b, int n, int m, float radius, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor idx_tensor);
+
+ void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample,
+     const float *new_xyz, const float *xyz, int *idx);
+
+ int ball_center_query_wrapper_fast(int b, int n, int m, float radius,
+     at::Tensor point_tensor, at::Tensor key_point_tensor, at::Tensor idx_tensor);
+
+ void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius,
+     const float *point, const float *key_point, int *idx);
+
+ int knn_query_wrapper_fast(int b, int n, int m, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor xyz_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
+
+ void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample,
+     const float *new_xyz, const float *xyz, float *dist2, int *idx);
+
+
+ int ball_query_wrapper_stack(int B, int M, float radius, int nsample,
+     at::Tensor new_xyz_tensor, at::Tensor new_xyz_batch_cnt_tensor,
+     at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor);
+
+
+ void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample,
+     const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx);
+
+
+
+ #endif
pc_util/src/cluster.cpp ADDED
@@ -0,0 +1,50 @@
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ // #include <THC/THC.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include "cluster_gpu.h"
+
+ // extern THCState *state;
+
+ #include <ATen/cuda/CUDAContext.h>
+ #include <ATen/cuda/CUDAEvent.h>
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ #define CHECK_CUDA(x) do { \
+     if (!x.type().is_cuda()) { \
+         fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_CONTIGUOUS(x) do { \
+     if (!x.is_contiguous()) { \
+         fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
+         exit(-1); \
+     } \
+ } while (0)
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
+
+ int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor) {
+     CHECK_INPUT(xyz_tensor);
+     const float *xyz = xyz_tensor.data<float>();
+     int *idx = idx_tensor.data<int>();
+
+     dbscan_kernel_launcher_fast(b, n, eps, min_pts, xyz, idx);
+     return 1;
+ }
+
+
+ int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
+     at::Tensor new_xyz_tensor, at::Tensor num_tensor) {
+     CHECK_INPUT(xyz_tensor);
+     CHECK_INPUT(idx_tensor);
+     const float *xyz = xyz_tensor.data<float>();
+     const int *idx = idx_tensor.data<int>();
+     float *new_xyz = new_xyz_tensor.data<float>();
+     int *num = num_tensor.data<int>();
+
+     cluster_pts_kernel_launcher_fast(b, n, m, xyz, idx, new_xyz, num);
+     return 1;
+ }
+
pc_util/src/cluster_gpu.cu ADDED
@@ -0,0 +1,192 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+
6
+ #include "cluster_gpu.h"
7
+ #include "cuda_utils.h"
8
+
9
+
10
+ __device__ float get_dis(float x1, float y1, float z1, float x2, float y2, float z2) {
11
+ float dis = (x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2) + (z1 - z2) * (z1 - z2);
12
+ return sqrt(dis);
13
+ }
14
+ /*
15
+ __device__ void dfs (int i, int c, int n, int min_pts, const int* pts_cnt, const int* pts_adj, int* idx, int label) {
16
+ idx[i] = c;
17
+ if(pts_cnt[i] < min_pts) return;
18
+
19
+ for(int j=0;j<n;j++) {
20
+
21
+ int adj = pts_adj[i * n + j];
22
+ printf("%d %d %d\n", i * n, i * n + j, adj);
23
+ if (adj == -1) break;
24
+ if (idx[adj] == -1)
25
+ dfs(adj, c, n, min_pts, pts_cnt, pts_adj, idx, label);
26
+ }
27
+ }
28
+ */
29
+
30
+ __global__ void dbscan_kernel_fast(int b, int n, float eps, int min_pts, const float *__restrict__ xyz, int *__restrict__ idx,
31
+ int *__restrict__ pts_cnt, int *__restrict__ pts_adj, int *__restrict__ pts_stack) {
32
+ // xyz: (B, N, 3)
33
+ // output:
34
+ // idx: (B, N)
35
+ int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
36
+ if (bs_idx >= b) return;
37
+
38
+ xyz += bs_idx * n * 3;
39
+ idx += bs_idx * n;
40
+ pts_cnt += bs_idx * n;
41
+ pts_stack += bs_idx * n;
42
+ pts_adj += bs_idx * n * n;
43
+
44
+ for(int i=0;i<n;i++) {
45
+ pts_cnt[i] = 0;
46
+ for(int j=0;j<n;j++) {
47
+ pts_adj[i * n + j] = -1;
48
+ if(i==j) continue;
49
+ float x1 = xyz[i * 3 + 0];
50
+ float y1 = xyz[i * 3 + 1];
51
+ float z1 = xyz[i * 3 + 2];
52
+ float x2 = xyz[j * 3 + 0];
53
+ float y2 = xyz[j * 3 + 1];
54
+ float z2 = xyz[j * 3 + 2];
55
+
56
+ if(get_dis(x2, y2, z2, -10.0, -10.0, -10.0) < 1e-3) continue;
57
+ if(get_dis(x1, y1, z1, x2, y2, z2) <= eps) {
58
+ pts_adj[i * n + pts_cnt[i]] = j;
59
+ pts_cnt[i] += 1;
60
+ }
61
+
62
+ }
63
+ }
64
+
65
+ int cluster_idx = 0;
66
+
67
+ for(int i=0;i<n;i++) {
68
+ if(idx[i] != -1) continue;
69
+
70
+ if(pts_cnt[i] >= min_pts) {
71
+ for(int j=0;j<n;j++)
72
+ pts_stack[j] = -1;
73
+ pts_stack[0] = i;
74
+ int stack_idx = 0;
75
+ int stack_len = 1;
76
+ while (stack_idx < n && pts_stack[stack_idx] != -1)
77
+ {
78
+ int pts_idx = pts_stack[stack_idx];
79
+ idx[pts_idx] = cluster_idx;
80
+ if(pts_cnt[pts_idx] < min_pts){
81
+ stack_idx += 1;
82
+ continue;
83
+ }
84
+ for(int j=0;j<n;j++) {
85
+ int adj = pts_adj[pts_idx * n + j];
86
+ if (adj == -1) break;
87
+ if (idx[adj] == -1)
88
+ {
89
+ idx[adj] = -2;
90
+ pts_stack[stack_len++] = adj;
91
+ }
92
+ }
93
+ stack_idx += 1;
94
+ }
95
+ cluster_idx += 1;
96
+ }
97
+ }
98
+ }
99
+
100
+
101
+ void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx) {
102
+ // xyz: (B, N, 3)
103
+ // output:
104
+ // idx: (B, N)
105
+
106
+ cudaError_t err;
107
+
108
+ dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
109
+ dim3 threads(THREADS_PER_BLOCK);
110
+
111
+ int* pts_cnt;
112
+ int* pts_stack;
113
+ int* pts_adj;
114
+
115
+ err = cudaMalloc((void**)&pts_cnt, b * n * sizeof(int));
116
+ if (cudaSuccess != err) {
117
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
118
+ exit(-1);
119
+ }
120
+
121
+ err = cudaMalloc((void**)&pts_stack, b * n * sizeof(int));
122
+ if (cudaSuccess != err) {
123
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
124
+ exit(-1);
125
+ }
126
+
127
+ err = cudaMalloc((void**)&pts_adj, b * n * n * sizeof(int));
128
+ if (cudaSuccess != err) {
129
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
130
+ exit(-1);
131
+ }
132
+
133
+ dbscan_kernel_fast<<<blocks, threads>>>(b, n, eps, min_pts, xyz, idx, pts_cnt, pts_adj, pts_stack);
134
+ // cudaDeviceSynchronize(); // for using printf in kernel function
135
+ cudaFree(pts_cnt);
136
+ cudaFree(pts_stack);
137
+ cudaFree(pts_adj);
138
+ err = cudaGetLastError();
139
+ if (cudaSuccess != err) {
140
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
141
+ exit(-1);
142
+ }
143
+ }
144
+
145
+
146
+
147
+ __global__ void cluster_pts_kernel_fast(int b, int n, int m, const float *__restrict__ xyz, const int *__restrict__ idx,
148
+ float *__restrict__ new_xyz, int *__restrict__ num) {
149
+ int bs_idx = blockIdx.x * blockDim.x + threadIdx.x;
150
+ if (bs_idx >= b ) return;
151
+
152
+ xyz += bs_idx * n * 3;
153
+ idx += bs_idx * n;
154
+ new_xyz += bs_idx * m * 3;
155
+ num += bs_idx * m;
156
+
157
+ for(int i=0;i<n;i++) {
158
+ if (idx[i] == -1) continue;
159
+ int c_idx = idx[i];
160
+ new_xyz[c_idx * 3 + 0] += xyz[i * 3 + 0];
161
+ new_xyz[c_idx * 3 + 1] += xyz[i * 3 + 1];
162
+ new_xyz[c_idx * 3 + 2] += xyz[i * 3 + 2];
163
+ num[c_idx] += 1;
164
+ }
165
+ for(int i=0;i<m;i++) {
166
+ if (num[i] == 0) break;
167
+ new_xyz[i * 3 + 0] /= num[i];
168
+ new_xyz[i * 3 + 1] /= num[i];
169
+ new_xyz[i * 3 + 2] /= num[i];
170
+ }
171
+
172
+ }
173
+
174
+
175
+
176
+
177
+ void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num) {
178
+ cudaError_t err;
179
+
180
+ dim3 blocks(DIVUP(b, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
181
+ dim3 threads(THREADS_PER_BLOCK);
182
+
183
+ cluster_pts_kernel_fast<<<blocks, threads>>>(b, n, m, xyz, idx, new_xyz, num);
184
+ // cudaDeviceSynchronize(); // for using printf in kernel function
185
+ err = cudaGetLastError();
186
+ if (cudaSuccess != err) {
187
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
188
+ exit(-1);
189
+ }
190
+ }
191
+
192
+
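Note: a compact NumPy reference (illustrative sketch only, not part of the committed files) of the per-batch labelling that dbscan_kernel_fast above performs: points within eps of each other are adjacent, points with at least min_pts neighbours seed clusters that are grown with an explicit stack, padding points at (-10, -10, -10) are never counted as neighbours, and unassigned points keep the label -1:

    import numpy as np

    def dbscan_ref(xyz, eps, min_pts):
        """xyz: (N, 3) float array -> labels: (N,) int array, -1 for noise."""
        n = len(xyz)
        dist = np.linalg.norm(xyz[:, None, :] - xyz[None, :, :], axis=-1)
        pad = np.linalg.norm(xyz - np.array([-10.0, -10.0, -10.0]), axis=-1) < 1e-3
        adj = (dist <= eps) & ~np.eye(n, dtype=bool) & ~pad[None, :]   # padding points are never neighbours
        labels = np.full(n, -1, dtype=np.int32)
        cluster = 0
        for i in range(n):
            if labels[i] != -1 or adj[i].sum() < min_pts:
                continue                       # already assigned, or not a core point
            labels[i] = cluster
            stack = [i]
            while stack:                       # grow the cluster with an explicit stack
                p = stack.pop()
                if adj[p].sum() < min_pts:
                    continue                   # border point: labelled but not expanded
                for q in np.flatnonzero(adj[p]):
                    if labels[q] == -1:
                        labels[q] = cluster    # mark on push, like the kernel's -2 marker
                        stack.append(q)
            cluster += 1
        return labels

    pts = np.array([[0, 0, 0], [0.1, 0, 0], [0, 0.1, 0], [5, 5, 5]], dtype=np.float32)
    print(dbscan_ref(pts, eps=0.3, min_pts=2))   # -> [0 0 0 -1]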
pc_util/src/cluster_gpu.h ADDED
@@ -0,0 +1,34 @@
+ #ifndef _CLUSTER_GPU_H
+ #define _CLUSTER_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <vector>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+
+ int dbscan_wrapper_fast(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor idx_tensor);
+
+ void dbscan_kernel_launcher_fast(int b, int n, float eps, int min_pts, const float *xyz, int *idx);
+
+ int cluster_pts_wrapper_fast(int b, int n, int m, at::Tensor xyz_tensor, at::Tensor idx_tensor,
+     at::Tensor new_xyz_tensor, at::Tensor num_tensor);
+
+ void cluster_pts_kernel_launcher_fast(int b, int n, int m, const float *xyz, const int *idx, float *new_xyz, int *num);
+
+
+ int dbscan_wrapper_stack(int b, int n, float eps, int min_pts, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor,
+     at::Tensor idx_tensor);
+
+
+ void dbscan_kernel_launcher_stack(int b, int n, float eps, int min_pts,
+     const float *xyz, const int *xyz_batch_cnt, int *idx);
+
+ int cluster_pts_wrapper_stack(int B, at::Tensor xyz_tensor, at::Tensor xyz_batch_cnt_tensor, at::Tensor idx_tensor,
+     at::Tensor new_xyz_tensor, at::Tensor cluster_cnt_tensor);
+
+
+ void cluster_pts_kernel_launcher_stack(int B, const float *xyz, const int *xyz_batch_cnt, int *idx,
+     const float *new_xyz, const int *cluster_cnt);
+
+ #endif
+
pc_util/src/cuda_utils.h ADDED
@@ -0,0 +1,15 @@
+ #ifndef _CUDA_UTILS_H
+ #define _CUDA_UTILS_H
+
+ #include <cmath>
+
+ #define TOTAL_THREADS 1024
+ #define THREADS_PER_BLOCK 256
+ #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+
+ inline int opt_n_threads(int work_size) {
+     const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
+
+     return max(min(1 << pow_2, TOTAL_THREADS), 1);
+ }
+ #endif
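Note: DIVUP is integer ceiling division, used by the launchers in the .cu files above to size the CUDA grid so that every element gets a thread. A tiny Python check (illustrative only, not part of the committed files) of the same formula:

    THREADS_PER_BLOCK = 256

    def divup(m, n):
        # ((m) / (n) + ((m) % (n) > 0)) with C integer division
        return m // n + (m % n > 0)

    # e.g. 1000 query points need 4 blocks of 256 threads
    print(divup(1000, THREADS_PER_BLOCK))  # -> 4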
pc_util/src/group_points.cpp ADDED
@@ -0,0 +1,98 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <cuda.h>
3
+ #include <cuda_runtime_api.h>
4
+ #include <vector>
5
+ // #include <THC/THC.h>
6
+ #include "group_points_gpu.h"
7
+
8
+ // extern THCState *state;
9
+
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/CUDAEvent.h>
12
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
13
+
14
+ #define CHECK_CUDA(x) do { \
15
+ if (!x.type().is_cuda()) { \
16
+ fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
17
+ exit(-1); \
18
+ } \
19
+ } while (0)
20
+ #define CHECK_CONTIGUOUS(x) do { \
21
+ if (!x.is_contiguous()) { \
22
+ fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
23
+ exit(-1); \
24
+ } \
25
+ } while (0)
26
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
27
+
28
+
29
+
30
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
31
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
32
+
33
+ float *grad_points = grad_points_tensor.data<float>();
34
+ const int *idx = idx_tensor.data<int>();
35
+ const float *grad_out = grad_out_tensor.data<float>();
36
+
37
+ group_points_grad_kernel_launcher_fast(b, c, n, npoints, nsample, grad_out, idx, grad_points);
38
+ return 1;
39
+ }
40
+
41
+
42
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
43
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor) {
44
+
45
+ const float *points = points_tensor.data<float>();
46
+ const int *idx = idx_tensor.data<int>();
47
+ float *out = out_tensor.data<float>();
48
+
49
+ group_points_kernel_launcher_fast(b, c, n, npoints, nsample, points, idx, out);
50
+ return 1;
51
+ }
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+ int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
60
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
61
+ at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor) {
62
+
63
+ CHECK_INPUT(grad_out_tensor);
64
+ CHECK_INPUT(idx_tensor);
65
+ CHECK_INPUT(idx_batch_cnt_tensor);
66
+ CHECK_INPUT(features_batch_cnt_tensor);
67
+ CHECK_INPUT(grad_features_tensor);
68
+
69
+ const float *grad_out = grad_out_tensor.data<float>();
70
+ const int *idx = idx_tensor.data<int>();
71
+ const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
72
+ const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
73
+ float *grad_features = grad_features_tensor.data<float>();
74
+
75
+ group_points_grad_kernel_launcher_stack(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
76
+ return 1;
77
+ }
78
+
79
+
80
+ int group_points_wrapper_stack(int B, int M, int C, int nsample,
81
+ at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
82
+ at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor) {
83
+
84
+ CHECK_INPUT(features_tensor);
85
+ CHECK_INPUT(features_batch_cnt_tensor);
86
+ CHECK_INPUT(idx_tensor);
87
+ CHECK_INPUT(idx_batch_cnt_tensor);
88
+ CHECK_INPUT(out_tensor);
89
+
90
+ const float *features = features_tensor.data<float>();
91
+ const int *idx = idx_tensor.data<int>();
92
+ const int *features_batch_cnt = features_batch_cnt_tensor.data<int>();
93
+ const int *idx_batch_cnt = idx_batch_cnt_tensor.data<int>();
94
+ float *out = out_tensor.data<float>();
95
+
96
+ group_points_kernel_launcher_stack(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
97
+ return 1;
98
+ }
pc_util/src/group_points_gpu.cu ADDED
@@ -0,0 +1,199 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+
4
+ #include "cuda_utils.h"
5
+ #include "group_points_gpu.h"
6
+
7
+
8
+ __global__ void group_points_grad_kernel_fast(int b, int c, int n, int npoints, int nsample,
9
+ const float *__restrict__ grad_out, const int *__restrict__ idx, float *__restrict__ grad_points) {
10
+ // grad_out: (B, C, npoints, nsample)
11
+ // idx: (B, npoints, nsample)
12
+ // output:
13
+ // grad_points: (B, C, N)
14
+ int bs_idx = blockIdx.z;
15
+ int c_idx = blockIdx.y;
16
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
17
+ int pt_idx = index / nsample;
18
+ if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
19
+
20
+ int sample_idx = index % nsample;
21
+ grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
22
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
23
+
24
+ atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0] , grad_out[0]);
25
+ }
26
+
27
+ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
28
+ const float *grad_out, const int *idx, float *grad_points) {
29
+ // grad_out: (B, C, npoints, nsample)
30
+ // idx: (B, npoints, nsample)
31
+ // output:
32
+ // grad_points: (B, C, N)
33
+ cudaError_t err;
34
+ dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
35
+ dim3 threads(THREADS_PER_BLOCK);
36
+
37
+ group_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, grad_out, idx, grad_points);
38
+
39
+ err = cudaGetLastError();
40
+ if (cudaSuccess != err) {
41
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
42
+ exit(-1);
43
+ }
44
+ }
45
+
46
+
47
+ __global__ void group_points_kernel_fast(int b, int c, int n, int npoints, int nsample,
48
+ const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
49
+ // points: (B, C, N)
50
+ // idx: (B, npoints, nsample)
51
+ // output:
52
+ // out: (B, C, npoints, nsample)
53
+ int bs_idx = blockIdx.z;
54
+ int c_idx = blockIdx.y;
55
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
56
+ int pt_idx = index / nsample;
57
+ if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
58
+
59
+ int sample_idx = index % nsample;
60
+
61
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
62
+ int in_idx = bs_idx * c * n + c_idx * n + idx[0];
63
+ int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx;
64
+
65
+ out[out_idx] = points[in_idx];
66
+ }
67
+
68
+
69
+ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
70
+ const float *points, const int *idx, float *out) {
71
+ // points: (B, C, N)
72
+ // idx: (B, npoints, nsample)
73
+ // output:
74
+ // out: (B, C, npoints, nsample)
75
+ cudaError_t err;
76
+ dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
77
+ dim3 threads(THREADS_PER_BLOCK);
78
+
79
+ group_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, nsample, points, idx, out);
80
+ // cudaDeviceSynchronize(); // for using printf in kernel function
81
+ err = cudaGetLastError();
82
+ if (cudaSuccess != err) {
83
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
84
+ exit(-1);
85
+ }
86
+ }
87
+
88
+
89
+ __global__ void group_points_grad_kernel_stack(int B, int M, int C, int N, int nsample,
90
+ const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
91
+ // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
92
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
93
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
94
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
95
+ // :return:
96
+ // grad_features: (N1 + N2 ..., C) gradient of the features
97
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
98
+ int sample_idx = index % nsample;
99
+ int C_idx = (index / nsample) % C;
100
+ int pt_idx = (index / nsample / C);
101
+
102
+ if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
103
+
104
+ int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
105
+ for (int k = 1; k < B; k++){
106
+ if (pt_idx < pt_cnt) break;
107
+ pt_cnt += idx_batch_cnt[k];
108
+ bs_idx = k;
109
+ }
110
+
111
+ int features_batch_start_idx = 0;
112
+ for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
113
+
114
+ grad_out += pt_idx * C * nsample + C_idx * nsample + sample_idx;
115
+ idx += pt_idx * nsample + sample_idx;
116
+ grad_features += (features_batch_start_idx + idx[0]) * C + C_idx;
117
+
118
+ atomicAdd(grad_features, grad_out[0]);
119
+ }
120
+
121
+ void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
122
+ const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features) {
123
+ // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the output from forward
124
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
125
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
126
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
127
+ // :return:
128
+ // grad_features: (N1 + N2 ..., C) gradient of the features
129
+
130
+ cudaError_t err;
131
+ // dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
132
+ dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
133
+ dim3 threads(THREADS_PER_BLOCK);
134
+
135
+ group_points_grad_kernel_stack<<<blocks, threads>>>(B, M, C, N, nsample, grad_out, idx, idx_batch_cnt, features_batch_cnt, grad_features);
136
+
137
+ err = cudaGetLastError();
138
+ if (cudaSuccess != err) {
139
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
140
+ exit(-1);
141
+ }
142
+ }
143
+
144
+
145
+ __global__ void group_points_kernel_stack(int B, int M, int C, int nsample,
146
+ const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
147
+ // :param features: (N1 + N2 ..., C) tensor of features to group
148
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
149
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
150
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
151
+ // :return:
152
+ // output: (M1 + M2, C, nsample) tensor
153
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
154
+ int sample_idx = index % nsample;
155
+ int C_idx = (index / nsample) % C;
156
+ int pt_idx = (index / nsample / C);
157
+
158
+ if (pt_idx >= M || C_idx >= C || sample_idx >= nsample) return;
159
+
160
+ int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
161
+ for (int k = 1; k < B; k++){
162
+ if (pt_idx < pt_cnt) break;
163
+ pt_cnt += idx_batch_cnt[k];
164
+ bs_idx = k;
165
+ }
166
+
167
+ int features_batch_start_idx = 0;
168
+ for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k];
169
+ features += features_batch_start_idx * C;
170
+
171
+ idx += pt_idx * nsample + sample_idx;
172
+ int in_idx = idx[0] * C + C_idx;
173
+ int out_idx = pt_idx * C * nsample + C_idx * nsample + sample_idx;
174
+
175
+ out[out_idx] = features[in_idx];
176
+ }
177
+
178
+
179
+ void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
180
+ const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out) {
181
+ // :param features: (N1 + N2 ..., C) tensor of features to group
182
+ // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the indicies of features to group with
183
+ // :param idx: (M1 + M2 ..., nsample) tensor containing the indicies of features to group with
184
+ // :param idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indicies of features to group with
185
+ // :return:
186
+ // output: (M1 + M2, C, nsample) tensor
187
+
188
+ cudaError_t err;
189
+ dim3 blocks(DIVUP(M * C * nsample, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
190
+ dim3 threads(THREADS_PER_BLOCK);
191
+
192
+ group_points_kernel_stack<<<blocks, threads>>>(B, M, C, nsample, features, features_batch_cnt, idx, idx_batch_cnt, out);
193
+ // cudaDeviceSynchronize(); // for using printf in kernel function
194
+ err = cudaGetLastError();
195
+ if (cudaSuccess != err) {
196
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
197
+ exit(-1);
198
+ }
199
+ }
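Note: the forward grouping kernel above is a pure gather; a NumPy equivalent (illustrative sketch only, not part of the committed files) of group_points_kernel_fast, which writes out[b, c, p, s] = points[b, c, idx[b, p, s]]:

    import numpy as np

    B, C, N, npoints, nsample = 2, 4, 100, 16, 8
    rng = np.random.default_rng(0)
    points = rng.standard_normal((B, C, N)).astype(np.float32)   # (B, C, N) features
    idx = rng.integers(0, N, size=(B, npoints, nsample))          # (B, npoints, nsample) indices

    # Gather along the point dimension: out[b, c, p, s] = points[b, c, idx[b, p, s]]
    out = np.take_along_axis(points[:, :, None, :],               # (B, C, 1, N)
                             idx[:, None, :, :],                  # (B, 1, npoints, nsample)
                             axis=-1)                             # -> (B, C, npoints, nsample)
    assert out.shape == (B, C, npoints, nsample)
    assert out[1, 2, 3, 4] == points[1, 2, idx[1, 3, 4]]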
pc_util/src/group_points_gpu.h ADDED
@@ -0,0 +1,36 @@
+ #ifndef _GROUP_POINTS_GPU_H
+ #define _GROUP_POINTS_GPU_H
+
+ #include <torch/serialize/tensor.h>
+ #include <cuda.h>
+ #include <cuda_runtime_api.h>
+ #include <vector>
+
+
+ int group_points_wrapper_fast(int b, int c, int n, int npoints, int nsample,
+     at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
+
+ void group_points_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
+     const float *points, const int *idx, float *out);
+
+ int group_points_grad_wrapper_fast(int b, int c, int n, int npoints, int nsample,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
+
+ void group_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints, int nsample,
+     const float *grad_out, const int *idx, float *grad_points);
+
+ int group_points_wrapper_stack(int B, int M, int C, int nsample,
+     at::Tensor features_tensor, at::Tensor features_batch_cnt_tensor,
+     at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor, at::Tensor out_tensor);
+
+ void group_points_kernel_launcher_stack(int B, int M, int C, int nsample,
+     const float *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, float *out);
+
+ int group_points_grad_wrapper_stack(int B, int M, int C, int N, int nsample,
+     at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor idx_batch_cnt_tensor,
+     at::Tensor features_batch_cnt_tensor, at::Tensor grad_features_tensor);
+
+ void group_points_grad_kernel_launcher_stack(int B, int M, int C, int N, int nsample,
+     const float *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, float *grad_features);
+
+ #endif
pc_util/src/interpolate.cpp ADDED
@@ -0,0 +1,148 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <vector>
3
+ // #include <THC/THC.h>
4
+ #include <math.h>
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <cuda.h>
8
+ #include <cuda_runtime_api.h>
9
+ #include "interpolate_gpu.h"
10
+
11
+ // extern THCState *state;
12
+
13
+ #include <ATen/cuda/CUDAContext.h>
14
+ #include <ATen/cuda/CUDAEvent.h>
15
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
16
+
17
+ #define CHECK_CUDA(x) do { \
18
+ if (!x.type().is_cuda()) { \
19
+ fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
20
+ exit(-1); \
21
+ } \
22
+ } while (0)
23
+ #define CHECK_CONTIGUOUS(x) do { \
24
+ if (!x.is_contiguous()) { \
25
+ fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
26
+ exit(-1); \
27
+ } \
28
+ } while (0)
29
+ #define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
30
+
31
+
32
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
33
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor) {
34
+ const float *unknown = unknown_tensor.data<float>();
35
+ const float *known = known_tensor.data<float>();
36
+ float *dist2 = dist2_tensor.data<float>();
37
+ int *idx = idx_tensor.data<int>();
38
+
39
+ three_nn_kernel_launcher_fast(b, n, m, unknown, known, dist2, idx);
40
+ }
41
+
42
+
43
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n,
44
+ at::Tensor points_tensor,
45
+ at::Tensor idx_tensor,
46
+ at::Tensor weight_tensor,
47
+ at::Tensor out_tensor) {
48
+
49
+ const float *points = points_tensor.data<float>();
50
+ const float *weight = weight_tensor.data<float>();
51
+ float *out = out_tensor.data<float>();
52
+ const int *idx = idx_tensor.data<int>();
53
+
54
+
55
+ three_interpolate_kernel_launcher_fast(b, c, m, n, points, idx, weight, out);
56
+ }
57
+
58
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m,
59
+ at::Tensor grad_out_tensor,
60
+ at::Tensor idx_tensor,
61
+ at::Tensor weight_tensor,
62
+ at::Tensor grad_points_tensor) {
63
+
64
+ const float *grad_out = grad_out_tensor.data<float>();
65
+ const float *weight = weight_tensor.data<float>();
66
+ float *grad_points = grad_points_tensor.data<float>();
67
+ const int *idx = idx_tensor.data<int>();
68
+
69
+ three_interpolate_grad_kernel_launcher_fast(b, c, n, m, grad_out, idx, weight, grad_points);
70
+ }
71
+
72
+
73
+ void three_nn_wrapper_stack(at::Tensor unknown_tensor,
74
+ at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
75
+ at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor){
76
+ // unknown: (N1 + N2 ..., 3)
77
+ // unknown_batch_cnt: (batch_size), [N1, N2, ...]
78
+ // known: (M1 + M2 ..., 3)
79
+ // known_batch_cnt: (batch_size), [M1, M2, ...]
80
+ // Return:
81
+ // dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
82
+ // idx: (N1 + N2 ..., 3) index of the three nearest neighbors
83
+ CHECK_INPUT(unknown_tensor);
84
+ CHECK_INPUT(unknown_batch_cnt_tensor);
85
+ CHECK_INPUT(known_tensor);
86
+ CHECK_INPUT(known_batch_cnt_tensor);
87
+ CHECK_INPUT(dist2_tensor);
88
+ CHECK_INPUT(idx_tensor);
89
+
90
+ int batch_size = unknown_batch_cnt_tensor.size(0);
91
+ int N = unknown_tensor.size(0);
92
+ int M = known_tensor.size(0);
93
+ const float *unknown = unknown_tensor.data<float>();
94
+ const int *unknown_batch_cnt = unknown_batch_cnt_tensor.data<int>();
95
+ const float *known = known_tensor.data<float>();
96
+ const int *known_batch_cnt = known_batch_cnt_tensor.data<int>();
97
+ float *dist2 = dist2_tensor.data<float>();
98
+ int *idx = idx_tensor.data<int>();
99
+
100
+ three_nn_kernel_launcher_stack(batch_size, N, M, unknown, unknown_batch_cnt, known, known_batch_cnt, dist2, idx);
101
+ }
102
+
103
+
104
+ void three_interpolate_wrapper_stack(at::Tensor features_tensor,
105
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor) {
106
+ // features_tensor: (M1 + M2 ..., C)
107
+ // idx_tensor: [N1 + N2 ..., 3]
108
+ // weight_tensor: [N1 + N2 ..., 3]
109
+ // Return:
110
+ // out_tensor: (N1 + N2 ..., C)
111
+ CHECK_INPUT(features_tensor);
112
+ CHECK_INPUT(idx_tensor);
113
+ CHECK_INPUT(weight_tensor);
114
+ CHECK_INPUT(out_tensor);
115
+
116
+ int N = out_tensor.size(0);
117
+ int channels = features_tensor.size(1);
118
+ const float *features = features_tensor.data<float>();
119
+ const float *weight = weight_tensor.data<float>();
120
+ const int *idx = idx_tensor.data<int>();
121
+ float *out = out_tensor.data<float>();
122
+
123
+ three_interpolate_kernel_launcher_stack(N, channels, features, idx, weight, out);
124
+ }
125
+
126
+
127
+ void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
128
+ at::Tensor weight_tensor, at::Tensor grad_features_tensor) {
129
+ // grad_out_tensor: (N1 + N2 ..., C)
130
+ // idx_tensor: [N1 + N2 ..., 3]
131
+ // weight_tensor: [N1 + N2 ..., 3]
132
+ // Return:
133
+ // grad_features_tensor: (M1 + M2 ..., C)
134
+ CHECK_INPUT(grad_out_tensor);
135
+ CHECK_INPUT(idx_tensor);
136
+ CHECK_INPUT(weight_tensor);
137
+ CHECK_INPUT(grad_features_tensor);
138
+
139
+ int N = grad_out_tensor.size(0);
140
+ int channels = grad_out_tensor.size(1);
141
+ const float *grad_out = grad_out_tensor.data<float>();
142
+ const float *weight = weight_tensor.data<float>();
143
+ const int *idx = idx_tensor.data<int>();
144
+ float *grad_features = grad_features_tensor.data<float>();
145
+
146
+ // printf("N=%d, channels=%d\n", N, channels);
147
+ three_interpolate_grad_kernel_launcher_stack(N, channels, grad_out, idx, weight, grad_features);
148
+ }
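Note: a NumPy reference for a single batch element (illustrative sketch only, not part of the committed files) of the three_nn / three_interpolate pair wrapped above. The wrappers take the interpolation weights as an input; normalised inverse-distance weights are the usual choice in PointNet++-style code, so that is what the sketch uses:

    import numpy as np

    def three_nn_ref(unknown, known):
        """unknown: (N, 3), known: (M, 3) -> squared dists and indices of the 3 nearest, each (N, 3)."""
        d2 = np.sum((unknown[:, None, :] - known[None, :, :]) ** 2, axis=-1)   # (N, M)
        idx = np.argsort(d2, axis=1)[:, :3]
        return np.take_along_axis(d2, idx, axis=1), idx

    def three_interpolate_ref(features, idx, weight):
        """features: (C, M), idx/weight: (N, 3) -> out: (C, N), out[:, i] = sum_j weight[i, j] * features[:, idx[i, j]]."""
        return np.einsum('nj,cnj->cn', weight, features[:, idx])

    rng = np.random.default_rng(0)
    known = rng.random((16, 3)).astype(np.float32)       # points that carry features
    unknown = rng.random((50, 3)).astype(np.float32)     # points to interpolate onto
    features = rng.random((8, 16)).astype(np.float32)    # (C, M)

    dist2, idx = three_nn_ref(unknown, known)
    weight = 1.0 / (dist2 + 1e-8)
    weight /= weight.sum(axis=1, keepdims=True)          # normalised inverse-distance weights
    out = three_interpolate_ref(features, idx, weight)   # (C, N)
    assert out.shape == (8, 50)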
pc_util/src/interpolate_gpu.cu ADDED
@@ -0,0 +1,343 @@
1
+ #include <math.h>
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "cuda_utils.h"
6
+ #include "interpolate_gpu.h"
7
+
8
+
9
+ __global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
10
+ const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
11
+ // unknown: (B, N, 3)
12
+ // known: (B, M, 3)
13
+ // output:
14
+ // dist2: (B, N, 3)
15
+ // idx: (B, N, 3)
16
+
17
+ int bs_idx = blockIdx.y;
18
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
19
+ if (bs_idx >= b || pt_idx >= n) return;
20
+
21
+ unknown += bs_idx * n * 3 + pt_idx * 3;
22
+ known += bs_idx * m * 3;
23
+ dist2 += bs_idx * n * 3 + pt_idx * 3;
24
+ idx += bs_idx * n * 3 + pt_idx * 3;
25
+
26
+ float ux = unknown[0];
27
+ float uy = unknown[1];
28
+ float uz = unknown[2];
29
+
30
+ double best1 = 1e40, best2 = 1e40, best3 = 1e40;
31
+ int besti1 = 0, besti2 = 0, besti3 = 0;
32
+ for (int k = 0; k < m; ++k) {
33
+ float x = known[k * 3 + 0];
34
+ float y = known[k * 3 + 1];
35
+ float z = known[k * 3 + 2];
36
+ float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
37
+ if (d < best1) {
38
+ best3 = best2; besti3 = besti2;
39
+ best2 = best1; besti2 = besti1;
40
+ best1 = d; besti1 = k;
41
+ }
42
+ else if (d < best2) {
43
+ best3 = best2; besti3 = besti2;
44
+ best2 = d; besti2 = k;
45
+ }
46
+ else if (d < best3) {
47
+ best3 = d; besti3 = k;
48
+ }
49
+ }
50
+ dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
51
+ idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
52
+ }
53
+
54
+
55
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
56
+ const float *known, float *dist2, int *idx) {
57
+ // unknown: (B, N, 3)
58
+ // known: (B, M, 3)
59
+ // output:
60
+ // dist2: (B, N, 3)
61
+ // idx: (B, N, 3)
62
+
63
+ cudaError_t err;
64
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); // blockIdx.x(col), blockIdx.y(row)
65
+ dim3 threads(THREADS_PER_BLOCK);
66
+
67
+ three_nn_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);
68
+
69
+ err = cudaGetLastError();
70
+ if (cudaSuccess != err) {
71
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
72
+ exit(-1);
73
+ }
74
+ }
75
+
76
+
77
+ __global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
78
+ const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
79
+ // points: (B, C, M)
80
+ // idx: (B, N, 3)
81
+ // weight: (B, N, 3)
82
+ // output:
83
+ // out: (B, C, N)
84
+
85
+ int bs_idx = blockIdx.z;
86
+ int c_idx = blockIdx.y;
87
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
88
+
89
+ if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
90
+
91
+ weight += bs_idx * n * 3 + pt_idx * 3;
92
+ points += bs_idx * c * m + c_idx * m;
93
+ idx += bs_idx * n * 3 + pt_idx * 3;
94
+ out += bs_idx * c * n + c_idx * n;
95
+
96
+ out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
97
+ }
98
+
99
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
100
+ const float *points, const int *idx, const float *weight, float *out) {
101
+ // points: (B, C, M)
102
+ // idx: (B, N, 3)
103
+ // weight: (B, N, 3)
104
+ // output:
105
+ // out: (B, C, N)
106
+
107
+ cudaError_t err;
108
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
109
+ dim3 threads(THREADS_PER_BLOCK);
110
+ three_interpolate_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);
111
+
112
+ err = cudaGetLastError();
113
+ if (cudaSuccess != err) {
114
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
115
+ exit(-1);
116
+ }
117
+ }
118
+
119
+
120
+ __global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
121
+ const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
122
+ // grad_out: (B, C, N)
123
+ // weight: (B, N, 3)
124
+ // output:
125
+ // grad_points: (B, C, M)
126
+
127
+ int bs_idx = blockIdx.z;
128
+ int c_idx = blockIdx.y;
129
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
130
+
131
+ if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
132
+
133
+ grad_out += bs_idx * c * n + c_idx * n + pt_idx;
134
+ weight += bs_idx * n * 3 + pt_idx * 3;
135
+ grad_points += bs_idx * c * m + c_idx * m;
136
+ idx += bs_idx * n * 3 + pt_idx * 3;
137
+
138
+
139
+ atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
140
+ atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
141
+ atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
142
+ }
143
+
144
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
145
+ const int *idx, const float *weight, float *grad_points) {
146
+ // grad_out: (B, C, N)
147
+ // weight: (B, N, 3)
148
+ // output:
149
+ // grad_points: (B, C, M)
150
+
151
+ cudaError_t err;
152
+ dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
153
+ dim3 threads(THREADS_PER_BLOCK);
154
+ three_interpolate_grad_kernel_fast<<<blocks, threads>>>(b, c, n, m, grad_out, idx, weight, grad_points);
155
+
156
+ err = cudaGetLastError();
157
+ if (cudaSuccess != err) {
158
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
159
+ exit(-1);
160
+ }
161
+ }
162
+
163
+
164
+ __global__ void three_nn_kernel_stack(int batch_size, int N, int M, const float *unknown,
165
+ const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
166
+ float *dist2, int *idx) {
167
+ // unknown: (N1 + N2 ..., 3)
168
+ // unknown_batch_cnt: (batch_size), [N1, N2, ...]
169
+ // known: (M1 + M2 ..., 3)
170
+ // known_batch_cnt: (batch_size), [M1, M2, ...]
171
+ // Return:
172
+ // dist: (N1 + N2 ..., 3) l2 distance to the three nearest neighbors
173
+ // idx: (N1 + N2 ..., 3) index of the three nearest neighbors
174
+
175
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
176
+ if (pt_idx >= N) return;
177
+
178
+ int bs_idx = 0, pt_cnt = unknown_batch_cnt[0];
179
+ for (int k = 1; k < batch_size; k++){
180
+ if (pt_idx < pt_cnt) break;
181
+ pt_cnt += unknown_batch_cnt[k];
182
+ bs_idx = k;
183
+ }
184
+
185
+ int cur_num_known_points = known_batch_cnt[bs_idx];
186
+
187
+ int known_batch_start_idx = 0;
188
+ for (int k = 0; k < bs_idx; k++) known_batch_start_idx += known_batch_cnt[k];
189
+
190
+ known += known_batch_start_idx * 3;
191
+ unknown += pt_idx * 3;
192
+ dist2 += pt_idx * 3;
193
+ idx += pt_idx * 3;
194
+
195
+ float ux = unknown[0];
196
+ float uy = unknown[1];
197
+ float uz = unknown[2];
198
+
199
+ double best1 = 1e40, best2 = 1e40, best3 = 1e40;
200
+ int besti1 = 0, besti2 = 0, besti3 = 0;
201
+ for (int k = 0; k < cur_num_known_points; ++k) {
202
+ float x = known[k * 3 + 0];
203
+ float y = known[k * 3 + 1];
204
+ float z = known[k * 3 + 2];
205
+ float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
206
+ if (d < best1) {
207
+ best3 = best2; besti3 = besti2;
208
+ best2 = best1; besti2 = besti1;
209
+ best1 = d; besti1 = k;
210
+ }
211
+ else if (d < best2) {
212
+ best3 = best2; besti3 = besti2;
213
+ best2 = d; besti2 = k;
214
+ }
215
+ else if (d < best3) {
216
+ best3 = d; besti3 = k;
217
+ }
218
+ }
219
+ dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
220
+ idx[0] = besti1 + known_batch_start_idx;
221
+ idx[1] = besti2 + known_batch_start_idx;
222
+ idx[2] = besti3 + known_batch_start_idx;
223
+ }
224
+
225
+
226
+ void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
227
+ const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
228
+ float *dist2, int *idx) {
229
+ // unknown: (N1 + N2 ..., 3)
230
+ // unknown_batch_cnt: (batch_size), [N1, N2, ...]
231
+ // known: (M1 + M2 ..., 3)
232
+ // known_batch_cnt: (batch_size), [M1, M2, ...]
233
+ // Return:
234
+ // dist2: (N1 + N2 ..., 3) squared L2 distance to the three nearest neighbors
235
+ // idx: (N1 + N2 ..., 3) index of the three nearest neighbors
236
+
237
+ cudaError_t err;
238
+ dim3 blocks(DIVUP(N, THREADS_PER_BLOCK)); // blockIdx.x(col), blockIdx.y(row)
239
+ dim3 threads(THREADS_PER_BLOCK);
240
+
241
+ three_nn_kernel_stack<<<blocks, threads>>>(
242
+ batch_size, N, M, unknown, unknown_batch_cnt,
243
+ known, known_batch_cnt, dist2, idx
244
+ );
245
+
246
+ err = cudaGetLastError();
247
+ if (cudaSuccess != err) {
248
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
249
+ exit(-1);
250
+ }
251
+ }
252
+
253
+
254
+
255
+ __global__ void three_interpolate_kernel_stack(int N, int channels, const float *features,
256
+ const int *idx, const float *weight, float *out) {
257
+ // features: (M1 + M2 ..., C)
258
+ // idx: [N1 + N2 ..., 3]
259
+ // weight: [N1 + N2 ..., 3]
260
+ // Return:
261
+ // out: (N1 + N2 ..., C)
262
+
263
+ int c_idx = blockIdx.y;
264
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
265
+ if (pt_idx >= N || c_idx >= channels) return;
266
+
267
+ weight += pt_idx * 3;
268
+ idx += pt_idx * 3;
269
+ out += pt_idx * channels + c_idx;
270
+
271
+ out[0] = weight[0] * features[idx[0] * channels + c_idx] +
272
+ weight[1] * features[idx[1] * channels + c_idx] +
273
+ weight[2] * features[idx[2] * channels + c_idx];
274
+ }
275
+
276
+
277
+
278
+ void three_interpolate_kernel_launcher_stack(int N, int channels,
279
+ const float *features, const int *idx, const float *weight, float *out) {
280
+ // features: (M1 + M2 ..., C)
281
+ // idx: [N1 + N2 ..., 3]
282
+ // weight: [N1 + N2 ..., 3]
283
+ // Return:
284
+ // out: (N1 + N2 ..., C)
285
+
286
+ cudaError_t err;
287
+ dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
288
+ dim3 threads(THREADS_PER_BLOCK);
289
+ three_interpolate_kernel_stack<<<blocks, threads>>>(N, channels, features, idx, weight, out);
290
+
291
+ err = cudaGetLastError();
292
+ if (cudaSuccess != err) {
293
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
294
+ exit(-1);
295
+ }
296
+ }
297
+
298
+
299
+ __global__ void three_interpolate_grad_kernel_stack(int N, int channels, const float *grad_out,
300
+ const int *idx, const float *weight, float *grad_features) {
301
+ // grad_out_tensor: (N1 + N2 ..., C)
302
+ // idx_tensor: [N1 + N2 ..., 3]
303
+ // weight_tensor: [N1 + N2 ..., 3]
304
+ // Return:
305
+ // grad_features_tensor: (M1 + M2 ..., C)
306
+
307
+ int c_idx = blockIdx.y;
308
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
309
+ if (pt_idx >= N || c_idx >= channels) return;
310
+
311
+ grad_out += pt_idx * channels + c_idx;
312
+ weight += pt_idx * 3;
313
+ idx += pt_idx * 3;
314
+
315
+ // printf("pt_idx=%d, c_idx=%d, idx=(%d, %d, %d), grad_out=%f\n", pt_idx, c_idx, idx[0], idx[1], idx[2], grad_out[0]);
316
+
317
+ atomicAdd(grad_features + idx[0] * channels + c_idx, grad_out[0] * weight[0]);
318
+ atomicAdd(grad_features + idx[1] * channels + c_idx, grad_out[0] * weight[1]);
319
+ atomicAdd(grad_features + idx[2] * channels + c_idx, grad_out[0] * weight[2]);
320
+ }
321
+
322
+
323
+ void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
324
+ const int *idx, const float *weight, float *grad_features) {
325
+ // grad_out_tensor: (N1 + N2 ..., C)
326
+ // idx_tensor: [N1 + N2 ..., 3]
327
+ // weight_tensor: [N1 + N2 ..., 3]
328
+ // Return:
329
+ // grad_features_tensor: (M1 + M2 ..., C)
330
+
331
+ cudaError_t err;
332
+ dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels); // blockIdx.x(col), blockIdx.y(row)
333
+ dim3 threads(THREADS_PER_BLOCK);
334
+ three_interpolate_grad_kernel_stack<<<blocks, threads>>>(
335
+ N, channels, grad_out, idx, weight, grad_features
336
+ );
337
+
338
+ err = cudaGetLastError();
339
+ if (cudaSuccess != err) {
340
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
341
+ exit(-1);
342
+ }
343
+ }
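For reference, a minimal Python-side sketch of driving the stack variants above. It assumes the extension compiles and imports as pc_util (the binding names come from pointnet2_api.cpp below) and that the wrappers fill preallocated, contiguous CUDA tensors with the shapes documented in the kernel comments; the sizes are only illustrative.

    import torch
    import pc_util  # compiled CUDA extension; module name assumed from the build

    # Two point clouds stacked along dim 0: N1=100, N2=80 query points,
    # M1=50, M2=40 support points carrying the features.
    unknown = torch.rand(180, 3).cuda()
    unknown_batch_cnt = torch.tensor([100, 80], dtype=torch.int32).cuda()
    known = torch.rand(90, 3).cuda()
    known_batch_cnt = torch.tensor([50, 40], dtype=torch.int32).cuda()

    dist2 = torch.zeros(180, 3).cuda()                    # squared distances to the 3 NNs
    idx = torch.zeros(180, 3, dtype=torch.int32).cuda()   # indices into the stacked known points
    pc_util.three_nn_wrapper_stack(unknown, unknown_batch_cnt,
                                   known, known_batch_cnt, dist2, idx)

    # Inverse-distance weights, then interpolate C=16 features per query point.
    weight = 1.0 / (dist2 + 1e-8)
    weight = weight / weight.sum(dim=1, keepdim=True)
    features = torch.rand(90, 16).cuda()
    out = torch.zeros(180, 16).cuda()
    pc_util.three_interpolate_wrapper_stack(features, idx, weight, out)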
pc_util/src/interpolate_gpu.h ADDED
@@ -0,0 +1,61 @@
1
+ #ifndef _INTERPOLATE_GPU_H
2
+ #define _INTERPOLATE_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include <vector>
6
+ #include <cuda.h>
7
+ #include <cuda_runtime_api.h>
8
+
9
+
10
+ void three_nn_wrapper_fast(int b, int n, int m, at::Tensor unknown_tensor,
11
+ at::Tensor known_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
12
+
13
+ void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
14
+ const float *known, float *dist2, int *idx);
15
+
16
+
17
+ void three_interpolate_wrapper_fast(int b, int c, int m, int n, at::Tensor points_tensor,
18
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
19
+
20
+ void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
21
+ const float *points, const int *idx, const float *weight, float *out);
22
+
23
+
24
+ void three_interpolate_grad_wrapper_fast(int b, int c, int n, int m, at::Tensor grad_out_tensor,
25
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_points_tensor);
26
+
27
+ void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
28
+ const int *idx, const float *weight, float *grad_points);
29
+
30
+
31
+
32
+ void three_nn_wrapper_stack(at::Tensor unknown_tensor,
33
+ at::Tensor unknown_batch_cnt_tensor, at::Tensor known_tensor,
34
+ at::Tensor known_batch_cnt_tensor, at::Tensor dist2_tensor, at::Tensor idx_tensor);
35
+
36
+
37
+ void three_interpolate_wrapper_stack(at::Tensor features_tensor,
38
+ at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor out_tensor);
39
+
40
+
41
+
42
+ void three_interpolate_grad_wrapper_stack(at::Tensor grad_out_tensor, at::Tensor idx_tensor,
43
+ at::Tensor weight_tensor, at::Tensor grad_features_tensor);
44
+
45
+
46
+ void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
47
+ const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
48
+ float *dist2, int *idx);
49
+
50
+
51
+ void three_interpolate_kernel_launcher_stack(int N, int channels,
52
+ const float *features, const int *idx, const float *weight, float *out);
53
+
54
+
55
+
56
+ void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
57
+ const int *idx, const float *weight, float *grad_features);
58
+
59
+
60
+
61
+ #endif
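The fast wrappers declared above operate on dense, channel-first batches (features of shape (B, C, M)) rather than the stacked (N1 + N2 ..., C) layout. A minimal sketch of the corresponding call sequence, again assuming the module imports as pc_util and that each wrapper writes into a preallocated contiguous CUDA tensor:

    import torch
    import pc_util

    B, N, M, C = 2, 1024, 256, 64
    unknown = torch.rand(B, N, 3).cuda()   # points to interpolate onto
    known = torch.rand(B, M, 3).cuda()     # points that carry the features
    dist2 = torch.zeros(B, N, 3).cuda()
    idx = torch.zeros(B, N, 3, dtype=torch.int32).cuda()
    pc_util.three_nn_wrapper(B, N, M, unknown, known, dist2, idx)

    weight = 1.0 / (dist2 + 1e-8)
    weight = weight / weight.sum(dim=2, keepdim=True)
    features = torch.rand(B, C, M).cuda()
    out = torch.zeros(B, C, N).cuda()
    pc_util.three_interpolate_wrapper(B, C, M, N, features, idx, weight, out)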
pc_util/src/pointnet2_api.cpp ADDED
@@ -0,0 +1,41 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <torch/extension.h>
3
+
4
+ #include "ball_query_gpu.h"
5
+ #include "group_points_gpu.h"
6
+ #include "sampling_gpu.h"
7
+ #include "interpolate_gpu.h"
8
+ #include "cluster_gpu.h"
9
+
10
+
11
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
12
+ m.def("ball_query_wrapper", &ball_query_wrapper_fast, "ball_query_wrapper_fast");
13
+ m.def("ball_center_query_wrapper", &ball_center_query_wrapper_fast, "ball_center_query_wrapper_fast");
14
+ m.def("knn_query_wrapper", &knn_query_wrapper_fast, "knn_query_wrapper_fast");
15
+
16
+ m.def("group_points_wrapper", &group_points_wrapper_fast, "group_points_wrapper_fast");
17
+ m.def("group_points_grad_wrapper", &group_points_grad_wrapper_fast, "group_points_grad_wrapper_fast");
18
+
19
+ m.def("gather_points_wrapper", &gather_points_wrapper_fast, "gather_points_wrapper_fast");
20
+ m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper_fast, "gather_points_grad_wrapper_fast");
21
+
22
+ m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper, "furthest_point_sampling_wrapper");
23
+
24
+ m.def("three_nn_wrapper", &three_nn_wrapper_fast, "three_nn_wrapper_fast");
25
+ m.def("three_interpolate_wrapper", &three_interpolate_wrapper_fast, "three_interpolate_wrapper_fast");
26
+ m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper_fast, "three_interpolate_grad_wrapper_fast");
27
+
28
+ m.def("dbscan_wrapper", &dbscan_wrapper_fast, "dbscan_wrapper_fast");
29
+ m.def("cluster_pts_wrapper", &cluster_pts_wrapper_fast, "cluster_pts_wrapper_fast");
30
+
31
+
32
+ m.def("ball_query_wrapper_stack", &ball_query_wrapper_stack, "ball_query_wrapper_stack");
33
+
34
+ m.def("group_points_wrapper_stack", &group_points_wrapper_stack, "group_points_wrapper_stack");
35
+ m.def("group_points_grad_wrapper_stack", &group_points_grad_wrapper_stack, "group_points_grad_wrapper_stack");
36
+
37
+ m.def("three_nn_wrapper_stack", &three_nn_wrapper_stack, "three_nn_wrapper_stack");
38
+ m.def("three_interpolate_wrapper_stack", &three_interpolate_wrapper_stack, "three_interpolate_wrapper_stack");
39
+ m.def("three_interpolate_grad_wrapper_stack", &three_interpolate_grad_wrapper_stack, "three_interpolate_grad_wrapper_stack");
40
+
41
+ }
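Once the extension is compiled as a torch CUDA extension (the module name pc_util is assumed here), the bindings registered above can be smoke-tested from Python:

    import pc_util

    # Every name registered with m.def(...) above should be an attribute of the module.
    expected = [
        "ball_query_wrapper", "ball_center_query_wrapper", "knn_query_wrapper",
        "group_points_wrapper", "group_points_grad_wrapper",
        "gather_points_wrapper", "gather_points_grad_wrapper",
        "furthest_point_sampling_wrapper",
        "three_nn_wrapper", "three_interpolate_wrapper", "three_interpolate_grad_wrapper",
        "dbscan_wrapper", "cluster_pts_wrapper",
        "ball_query_wrapper_stack", "group_points_wrapper_stack",
        "three_nn_wrapper_stack", "three_interpolate_wrapper_stack",
    ]
    missing = [name for name in expected if not hasattr(pc_util, name)]
    assert not missing, f"missing bindings: {missing}"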
pc_util/src/sampling.cpp ADDED
@@ -0,0 +1,46 @@
1
+ #include <torch/serialize/tensor.h>
2
+ #include <ATen/cuda/CUDAContext.h>
3
+ #include <vector>
4
+ // #include <THC/THC.h>
5
+
6
+ #include "sampling_gpu.h"
7
+
8
+ // extern THCState *state;
9
+
10
+ #include <ATen/cuda/CUDAContext.h>
11
+ #include <ATen/cuda/CUDAEvent.h>
12
+ // cudaStream_t stream = at::cuda::getCurrentCUDAStream();
13
+
14
+ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
15
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor){
16
+ const float *points = points_tensor.data<float>();
17
+ const int *idx = idx_tensor.data<int>();
18
+ float *out = out_tensor.data<float>();
19
+
20
+ gather_points_kernel_launcher_fast(b, c, n, npoints, points, idx, out);
21
+ return 1;
22
+ }
23
+
24
+
25
+ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
26
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor) {
27
+
28
+ const float *grad_out = grad_out_tensor.data<float>();
29
+ const int *idx = idx_tensor.data<int>();
30
+ float *grad_points = grad_points_tensor.data<float>();
31
+
32
+ gather_points_grad_kernel_launcher_fast(b, c, n, npoints, grad_out, idx, grad_points);
33
+ return 1;
34
+ }
35
+
36
+
37
+ int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
38
+ at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor) {
39
+
40
+ const float *points = points_tensor.data<float>();
41
+ float *temp = temp_tensor.data<float>();
42
+ int *idx = idx_tensor.data<int>();
43
+
44
+ furthest_point_sampling_kernel_launcher(b, c, n, m, w1, w2, points, temp, idx);
45
+ return 1;
46
+ }
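A minimal usage sketch for the sampling wrappers above, assuming the module imports as pc_util. Note that the wrapper does not initialise temp; the kernel (in sampling_gpu.cu below) keeps a running minimum in it, so it must be pre-filled with a large value on the Python side:

    import torch
    import pc_util

    B, N, C, M = 2, 4096, 6, 1024          # xyz plus 3 extra feature channels
    points = torch.rand(B, N, C).cuda()
    temp = torch.full((B, N), 1e10, dtype=torch.float32).cuda()
    idx = torch.zeros(B, M, dtype=torch.int32).cuda()

    # w1 weights the xyz distance, w2 the feature distance (pure xyz FPS here).
    pc_util.furthest_point_sampling_wrapper(B, C, N, M, 1.0, 0.0, points, temp, idx)

    # Gather the sampled points: gather_points expects channel-first (B, C, N) input.
    points_t = points.transpose(1, 2).contiguous()
    sampled = torch.zeros(B, C, M).cuda()
    pc_util.gather_points_wrapper(B, C, N, M, points_t, idx, sampled)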
pc_util/src/sampling_gpu.cu ADDED
@@ -0,0 +1,259 @@
1
+ #include <stdio.h>
2
+ #include <stdlib.h>
3
+
4
+ #include "cuda_utils.h"
5
+ #include "sampling_gpu.h"
6
+
7
+
8
+ __global__ void gather_points_kernel_fast(int b, int c, int n, int m,
9
+ const float *__restrict__ points, const int *__restrict__ idx, float *__restrict__ out) {
10
+ // points: (B, C, N)
11
+ // idx: (B, M)
12
+ // output:
13
+ // out: (B, C, M)
14
+
15
+ int bs_idx = blockIdx.z;
16
+ int c_idx = blockIdx.y;
17
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
18
+ if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
19
+
20
+ out += bs_idx * c * m + c_idx * m + pt_idx;
21
+ idx += bs_idx * m + pt_idx;
22
+ points += bs_idx * c * n + c_idx * n;
23
+ out[0] = points[idx[0]];
24
+ }
25
+
26
+ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
27
+ const float *points, const int *idx, float *out) {
28
+ // points: (B, C, N)
29
+ // idx: (B, npoints)
30
+ // output:
31
+ // out: (B, C, npoints)
32
+
33
+ cudaError_t err;
34
+ dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
35
+ dim3 threads(THREADS_PER_BLOCK);
36
+
37
+ gather_points_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, points, idx, out);
38
+
39
+ err = cudaGetLastError();
40
+ if (cudaSuccess != err) {
41
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
42
+ exit(-1);
43
+ }
44
+ }
45
+
46
+ __global__ void gather_points_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
47
+ const int *__restrict__ idx, float *__restrict__ grad_points) {
48
+ // grad_out: (B, C, M)
49
+ // idx: (B, M)
50
+ // output:
51
+ // grad_points: (B, C, N)
52
+
53
+ int bs_idx = blockIdx.z;
54
+ int c_idx = blockIdx.y;
55
+ int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
56
+ if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
57
+
58
+ grad_out += bs_idx * c * m + c_idx * m + pt_idx;
59
+ idx += bs_idx * m + pt_idx;
60
+ grad_points += bs_idx * c * n + c_idx * n;
61
+
62
+ atomicAdd(grad_points + idx[0], grad_out[0]);
63
+ }
64
+
65
+ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
66
+ const float *grad_out, const int *idx, float *grad_points) {
67
+ // grad_out: (B, C, npoints)
68
+ // idx: (B, npoints)
69
+ // output:
70
+ // grad_points: (B, C, N)
71
+
72
+ cudaError_t err;
73
+ dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); // blockIdx.x(col), blockIdx.y(row)
74
+ dim3 threads(THREADS_PER_BLOCK);
75
+
76
+ gather_points_grad_kernel_fast<<<blocks, threads>>>(b, c, n, npoints, grad_out, idx, grad_points);
77
+
78
+ err = cudaGetLastError();
79
+ if (cudaSuccess != err) {
80
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
81
+ exit(-1);
82
+ }
83
+ }
84
+
85
+
86
+ __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2){
87
+ const float v1 = dists[idx1], v2 = dists[idx2];
88
+ const int i1 = dists_i[idx1], i2 = dists_i[idx2];
89
+ dists[idx1] = max(v1, v2);
90
+ dists_i[idx1] = v2 > v1 ? i2 : i1;
91
+ }
92
+
93
+ template <unsigned int block_size>
94
+ __global__ void furthest_point_sampling_kernel(int b, int c, int n, int m, float w1, float w2,
95
+ const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) {
96
+ // dataset: (B, N, C), xyz in the first 3 channels, optional feature channels after
97
+ // tmp: (B, N)
98
+ // output:
99
+ // idx: (B, M)
100
+
101
+ if (m <= 0) return;
102
+ __shared__ float dists[block_size];
103
+ __shared__ int dists_i[block_size];
104
+
105
+ int batch_index = blockIdx.x;
106
+ dataset += batch_index * n * c;
107
+ temp += batch_index * n;
108
+ idxs += batch_index * m;
109
+
110
+ int tid = threadIdx.x;
111
+ const int stride = block_size;
112
+
113
+ int old = 0;
114
+ if (threadIdx.x == 0)
115
+ idxs[0] = old;
116
+
117
+ __syncthreads();
118
+ for (int j = 1; j < m; j++) {
119
+ int besti = 0;
120
+ float best = -1;
121
+ float x1 = dataset[old * c + 0];
122
+ float y1 = dataset[old * c + 1];
123
+ float z1 = dataset[old * c + 2];
124
+
125
+ for (int k = tid; k < n; k += stride) {
126
+ float x2, y2, z2;
127
+ x2 = dataset[k * c + 0];
128
+ y2 = dataset[k * c + 1];
129
+ z2 = dataset[k * c + 2];
130
+ // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
131
+ // if (mag <= 1e-3)
132
+ // continue;
133
+
134
+ float xyz_d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
135
+ float fea_d = 0;
136
+ for (int l = 3; l < c; l++) {
137
+ fea_d += (dataset[old * c + l] - dataset[k * c + l]) * (dataset[old * c + l] - dataset[k * c + l]);
138
+ }
139
+ float d = w1 * xyz_d + w2 * fea_d;
140
+ float d2 = min(d, temp[k]);
141
+ temp[k] = d2;
142
+ besti = d2 > best ? k : besti;
143
+ best = d2 > best ? d2 : best;
144
+ }
145
+ dists[tid] = best;
146
+ dists_i[tid] = besti;
147
+ __syncthreads();
148
+
149
+ if (block_size >= 1024) {
150
+ if (tid < 512) {
151
+ __update(dists, dists_i, tid, tid + 512);
152
+ }
153
+ __syncthreads();
154
+ }
155
+
156
+ if (block_size >= 512) {
157
+ if (tid < 256) {
158
+ __update(dists, dists_i, tid, tid + 256);
159
+ }
160
+ __syncthreads();
161
+ }
162
+ if (block_size >= 256) {
163
+ if (tid < 128) {
164
+ __update(dists, dists_i, tid, tid + 128);
165
+ }
166
+ __syncthreads();
167
+ }
168
+ if (block_size >= 128) {
169
+ if (tid < 64) {
170
+ __update(dists, dists_i, tid, tid + 64);
171
+ }
172
+ __syncthreads();
173
+ }
174
+ if (block_size >= 64) {
175
+ if (tid < 32) {
176
+ __update(dists, dists_i, tid, tid + 32);
177
+ }
178
+ __syncthreads();
179
+ }
180
+ if (block_size >= 32) {
181
+ if (tid < 16) {
182
+ __update(dists, dists_i, tid, tid + 16);
183
+ }
184
+ __syncthreads();
185
+ }
186
+ if (block_size >= 16) {
187
+ if (tid < 8) {
188
+ __update(dists, dists_i, tid, tid + 8);
189
+ }
190
+ __syncthreads();
191
+ }
192
+ if (block_size >= 8) {
193
+ if (tid < 4) {
194
+ __update(dists, dists_i, tid, tid + 4);
195
+ }
196
+ __syncthreads();
197
+ }
198
+ if (block_size >= 4) {
199
+ if (tid < 2) {
200
+ __update(dists, dists_i, tid, tid + 2);
201
+ }
202
+ __syncthreads();
203
+ }
204
+ if (block_size >= 2) {
205
+ if (tid < 1) {
206
+ __update(dists, dists_i, tid, tid + 1);
207
+ }
208
+ __syncthreads();
209
+ }
210
+
211
+ old = dists_i[0];
212
+ if (tid == 0)
213
+ idxs[j] = old;
214
+ }
215
+ }
216
+
217
+ void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
218
+ const float *dataset, float *temp, int *idxs) {
219
+ // dataset: (B, N, C), xyz in the first 3 channels, optional feature channels after
220
+ // tmp: (B, N)
221
+ // output:
222
+ // idx: (B, M)
223
+
224
+ cudaError_t err;
225
+ unsigned int n_threads = opt_n_threads(n);
226
+
227
+ switch (n_threads) {
228
+ case 1024:
229
+ furthest_point_sampling_kernel<1024><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
230
+ case 512:
231
+ furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
232
+ case 256:
233
+ furthest_point_sampling_kernel<256><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
234
+ case 128:
235
+ furthest_point_sampling_kernel<128><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
236
+ case 64:
237
+ furthest_point_sampling_kernel<64><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
238
+ case 32:
239
+ furthest_point_sampling_kernel<32><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
240
+ case 16:
241
+ furthest_point_sampling_kernel<16><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
242
+ case 8:
243
+ furthest_point_sampling_kernel<8><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
244
+ case 4:
245
+ furthest_point_sampling_kernel<4><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
246
+ case 2:
247
+ furthest_point_sampling_kernel<2><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
248
+ case 1:
249
+ furthest_point_sampling_kernel<1><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs); break;
250
+ default:
251
+ furthest_point_sampling_kernel<512><<<b, n_threads>>>(b, c, n, m, w1, w2, dataset, temp, idxs);
252
+ }
253
+
254
+ err = cudaGetLastError();
255
+ if (cudaSuccess != err) {
256
+ fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
257
+ exit(-1);
258
+ }
259
+ }
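The selection rule in furthest_point_sampling_kernel (distance d = w1 * xyz_d + w2 * fea_d, a running minimum kept in temp, then an argmax reduction over the block) can be mirrored on the CPU for sanity checks on tiny inputs. A single-batch reference sketch, assuming points is an (N, C) tensor with xyz in the first three columns; the helper name is ours, not part of the extension:

    import torch

    def fps_reference(points, m, w1=1.0, w2=0.0):
        # Mirrors the kernel: temp[k] = min(temp[k], w1*xyz_d + w2*fea_d), then pick argmax.
        n = points.shape[0]
        temp = torch.full((n,), 1e10)
        idxs = torch.zeros(m, dtype=torch.long)   # idxs[0] = 0, as in the kernel
        old = 0
        for j in range(1, m):
            xyz_d = ((points[:, :3] - points[old, :3]) ** 2).sum(dim=1)
            fea_d = ((points[:, 3:] - points[old, 3:]) ** 2).sum(dim=1)
            temp = torch.min(temp, w1 * xyz_d + w2 * fea_d)
            old = int(torch.argmax(temp))
            idxs[j] = old
        return idxs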
pc_util/src/sampling_gpu.h ADDED
@@ -0,0 +1,29 @@
1
+ #ifndef _SAMPLING_GPU_H
2
+ #define _SAMPLING_GPU_H
3
+
4
+ #include <torch/serialize/tensor.h>
5
+ #include <ATen/cuda/CUDAContext.h>
6
+ #include <vector>
7
+
8
+
9
+ int gather_points_wrapper_fast(int b, int c, int n, int npoints,
10
+ at::Tensor points_tensor, at::Tensor idx_tensor, at::Tensor out_tensor);
11
+
12
+ void gather_points_kernel_launcher_fast(int b, int c, int n, int npoints,
13
+ const float *points, const int *idx, float *out);
14
+
15
+
16
+ int gather_points_grad_wrapper_fast(int b, int c, int n, int npoints,
17
+ at::Tensor grad_out_tensor, at::Tensor idx_tensor, at::Tensor grad_points_tensor);
18
+
19
+ void gather_points_grad_kernel_launcher_fast(int b, int c, int n, int npoints,
20
+ const float *grad_out, const int *idx, float *grad_points);
21
+
22
+
23
+ int furthest_point_sampling_wrapper(int b, int c, int n, int m, float w1, float w2,
24
+ at::Tensor points_tensor, at::Tensor temp_tensor, at::Tensor idx_tensor);
25
+
26
+ void furthest_point_sampling_kernel_launcher(int b, int c, int n, int m, float w1, float w2,
27
+ const float *dataset, float *temp, int *idxs);
28
+
29
+ #endif