indices(i, 0) = (int)minidx;
count((int)minidx, 0) += 1;
}
// Accumulates each sample's feature vector into its assigned cluster center.
// centers : ncluster x dim accumulator — must be zeroed beforehand (see zeroFloat).
// samples : N x dim data matrix.
// indices : N x 1 cluster assignment per sample.
// N, dim  : number of samples and feature dimensionality.
// NOTE(review): the flat index below uses N as the row stride, which is only
// correct if the grid's x-extent equals N — confirm against the host launch.
__global__ void centersAdd(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<int> indices, int N, int dim) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int i = col + row * N;
    if (i < N) {
        int index = indices(i, 0);
        for (int d = 0; d < dim; d++) {
            // Many samples map to the same cluster, so multiple threads update
            // the same centers row concurrently: a plain "+=" is a data race
            // that silently drops contributions. atomicAdd makes the
            // accumulation correct regardless of the assignment distribution.
            atomicAdd(&centers(index, d), samples(i, d));
        }
    }
}
// Divides each accumulated cluster center by its member count, turning the
// per-cluster sums produced by centersAdd into per-cluster means.
// centers  : ncluster x dim sum matrix, converted to means in place.
// count    : ncluster x 1 number of samples assigned to each cluster.
// ncluster : number of clusters; dim : feature dimensionality.
// NOTE(review): the flat index below uses ncluster as the row stride, which is
// only correct if the grid's x-extent equals ncluster — confirm against the
// host launch.
__global__ void centersDivide(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<int> count, int ncluster, int dim) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int i = col + row * ncluster;
    if (i >= ncluster)
        return;
    int members = count(i, 0);
    // Guard against empty clusters: dividing by a zero count would fill the
    // center with inf/NaN. An empty cluster keeps its previous coordinates.
    if (members > 0) {
        for (int d = 0; d < dim; d++) {
            centers(i, d) /= (float)members;
        }
    }
}
// Zero-fills a 2D float matrix.
// Launch with a 2D grid whose extent covers out.rows x out.cols; threads
// outside the matrix bounds do nothing.
__global__ void zeroFloat(cv::cuda::PtrStepSz<float> out) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < out.rows && c < out.cols) {
        out(r, c) = 0.0f;
    }
}
// Zero-fills a 2D int matrix.
// Launch with a 2D grid whose extent covers out.rows x out.cols; threads
// outside the matrix bounds do nothing.
__global__ void zeroInt(cv::cuda::PtrStepSz<int> out) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < out.rows && c < out.cols) {
        out(r, c) = 0;
    }
}
}
After programming this kernel, we noticed an improvement in performance. Here is a graph comparing the run-times of the serial program vs. the parallelized version. [[File:Assignment2Graph.png]]
==== Conclusion ====