You can find the new parallelized KmeansPlusPlus code
[https://github.com/agamdograMajinBui/KmeansPlusPlusCuda here].
Here are the kernels that we programmed.
/*
 * setCenter
 *
 * Copies one float per thread from d_sample into d_center.
 * Expects a 2D launch: the x index selects the column, the y index the row.
 * Only threads whose column and row are both below n perform a copy.
 *
 *   d_center  destination buffer, written at [row * n + col]
 *   d_sample  source buffer, read at [row * randi + col]
 *   n         bound applied to both the column and the row index
 *   dim       not read by this kernel
 *   randi     multiplier applied to the row index when reading d_sample
 *
 * NOTE(review): randi acts as the source row stride here, and dim is unused.
 * If the intent is to copy sample row `randi` into a center, the indexing
 * needs revisiting — confirm against the caller.
 */
__global__ void setCenter(float* d_center, float* d_sample, int n, int dim, int randi) {
    // Global coordinates of this thread within the 2D grid.
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the n x n region have nothing to do.
    if (col >= n || row >= n)
        return;

    const int dst = row * n + col;
    const int src = row * randi + col;
    d_center[dst] = d_sample[src];
}
Launching the setCenter kernel:
// 2D launch configuration: ntpb x ntpb threads per block, with enough blocks
// to cover n in each dimension (ceiling division).
// NOTE(review): ntpb * ntpb must stay within the device's max threads per
// block (1024 on current GPUs), i.e. ntpb <= 32 — confirm.
int nb = (n + ntpb - 1) / ntpb;
dim3 dGrid(nb, nb, 1);
dim3 dBlock(ntpb, ntpb, 1);

// Byte counts are widened to size_t before multiplying so that large
// matrices cannot overflow int arithmetic.
const size_t centerBytes = static_cast<size_t>(centers.rows) * centers.cols * sizeof(float);
const size_t sampleBytes = static_cast<size_t>(samples.rows) * samples.cols * sizeof(float);

// Device copy of the centers matrix.
float* d_center = nullptr;
check(cudaMalloc((void**)&d_center, centerBytes));
check(cudaMemcpy(d_center, (float*)centers.data, centerBytes, cudaMemcpyHostToDevice));

// Device copy of the samples matrix. The copy size must match the samples
// allocation; the previous version used the centers' size here.
float* d_sample = nullptr;
check(cudaMalloc((void**)&d_sample, sampleBytes));
check(cudaMemcpy(d_sample, (float*)samples.data, sampleBytes, cudaMemcpyHostToDevice));

// Random index in [0, n) passed to the kernel as randi.
int rand = genrand_int31() % n;

// NOTE(review): the grid is sized from n but the kernel receives N —
// confirm both hold the same value.
setCenter << <dGrid, dBlock >> >(d_center, d_sample, N, dim, rand);
check(cudaGetLastError());        // reports launch-configuration errors
check(cudaDeviceSynchronize());   // reports errors raised while the kernel ran
The kernels:
setSamples - goes through the entire image and collects samples from the image (the current pixel and the next x number of pixels).