36
edits
Changes
→Analysis
cudaDeviceSynchronize();
The kernels:
// Flattens an interleaved uchar image into a float sample matrix.
// One thread per pixel: the y launch dimension selects the image row and the
// x launch dimension selects the pixel column. Pixel (row, col) is written to
// sample row `row * img.cols + col`, and its dimC consecutive channel bytes
// (starting at byte column `col * dimC`) become that row's dimC float columns.
// Threads mapped outside img.rows x img.cols do nothing.
__global__ void setSamples(cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<uchar> img, int dimC) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < img.rows && col < img.cols) {
        const int sampleRow = row * img.cols + col;
        const int firstChannel = col * dimC;
        for (int c = 0; c < dimC; ++c) {
            samples(sampleRow, c) = static_cast<float>(img(row, firstChannel + c));
        }
    }
}
// Refreshes every sample's nearest-center squared distance against the most
// recently added center and accumulates the sum of those distances.
//
//   centers : center matrix; only row k - 1 is read, so k must be >= 1
//   samples : sample matrix, rows 0..N-1 and columns 0..dim-1 are read
//   k       : number of centers chosen so far (the newest one is row k - 1)
//   N       : number of samples
//   dim     : number of feature columns per sample
//   minval  : per-sample smallest squared distance seen so far (N entries);
//             lowered in place when the new center is closer
//   D       : single float accumulator; every in-range thread adds its
//             (possibly updated) minval[i] to it. It is not reset here --
//             NOTE(review): presumably the caller zeroes it before each launch.
//
// Indexing: the flat index is col + row * N, so only threads whose global
// row is 0 can fall inside [0, N); the x dimension of the launch must
// therefore span at least N threads.
__global__ void calculateDistance(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, int k, int N, int dim, double* minval, float* D) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // 64-bit arithmetic so row * N cannot wrap around into a bogus in-range index.
    const long long flat = (long long)col + (long long)row * N;
    if (flat >= N)
        return;
    const int i = (int)flat;

    // Squared Euclidean distance between sample i and the newest center.
    double dist = 0.0;
    for (int d = 0; d < dim; d++) {
        double diff = centers(k - 1, d) - samples(i, d);
        dist += diff * diff;
    }

    // Each thread owns minval[i] exclusively, so a plain update is safe here.
    double nearest = minval[i];
    if (dist < nearest) {
        nearest = dist;
        minval[i] = dist;
    }

    // D is shared by all threads: a plain "*D += ..." is a racy
    // read-modify-write that drops most contributions. Accumulate atomically.
    atomicAdd(D, (float)nearest);
}
// Assigns every sample to its nearest center and counts members per cluster.
//
//   indices  : output; indices(i, 0) receives the nearest-center index of sample i
//   count    : per-cluster member counters; count(c, 0) is incremented once for
//              every sample assigned to cluster c. It is not reset here --
//              NOTE(review): presumably the caller zeroes it before each launch.
//   centers  : center matrix, rows 0..ncluster-1 and columns 0..dim-1 are read
//   samples  : sample matrix, rows 0..N-1 and columns 0..dim-1 are read
//   N        : number of samples
//   ncluster : number of centers
//   dim      : number of feature columns per sample
//
// Indexing: the flat index is col + row * N, so only threads whose global
// row is 0 can fall inside [0, N); the x dimension of the launch must
// therefore span at least N threads.
__global__ void sampleClassification(cv::cuda::PtrStepSz<int> indices, cv::cuda::PtrStepSz<int> count, cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, int N, int ncluster, int dim) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // 64-bit arithmetic so row * N cannot wrap around into a bogus in-range index.
    const long long flat = (long long)col + (long long)row * N;
    if (flat >= N)
        return;
    const int i = (int)flat;

    // Linear scan for the center with the smallest squared distance.
    // Ties keep the lowest index; if no distance is below the initial
    // threshold the sample falls back to cluster 0.
    int best = 0;
    double bestDist = 1000000000;
    for (int k = 0; k < ncluster; k++) {
        double dist = 0.0;
        for (int d = 0; d < dim; d++) {
            float diff = centers(k, d) - samples(i, d);
            dist += diff * diff;
        }
        if (bestDist > dist) {
            bestDist = dist;
            best = k;
        }
    }

    indices(i, 0) = best;
    // Many threads pick the same cluster concurrently; a plain "+= 1" on the
    // shared counter loses increments, so the update must be atomic.
    atomicAdd(&count(best, 0), 1);
}
// Adds every sample's feature vector onto the center row of its assigned cluster.
//
//   centers : accumulator matrix; row indices(i, 0) receives sample i.
//             It is not reset here -- NOTE(review): presumably the caller
//             zeroes it before each launch.
//   samples : sample matrix, rows 0..N-1 and columns 0..dim-1 are read
//   indices : indices(i, 0) is the cluster assigned to sample i
//   N       : number of samples
//   dim     : number of feature columns per sample
//
// Indexing: the flat index is col + row * N, so only threads whose global
// row is 0 can fall inside [0, N); the x dimension of the launch must
// therefore span at least N threads.
__global__ void centersAdd(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<int> indices, int N, int dim) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // 64-bit arithmetic so row * N cannot wrap around into a bogus in-range index.
    const long long flat = (long long)col + (long long)row * N;
    if (flat >= N)
        return;
    const int i = (int)flat;

    const int cluster = indices(i, 0);
    for (int d = 0; d < dim; d++) {
        // Samples of the same cluster are handled by different threads that
        // all write the same center row; a plain "+=" races and drops
        // contributions, so accumulate atomically.
        atomicAdd(&centers(cluster, d), samples(i, d));
    }
}
// Turns per-cluster feature sums into means by dividing each center row by
// the cluster's member count.
//
//   centers  : on entry row c holds the summed features of cluster c;
//              on exit it holds their mean
//   count    : count(c, 0) is the number of samples assigned to cluster c
//   ncluster : number of centers
//   dim      : number of feature columns per center
//
// A cluster whose count is not positive is left untouched: dividing by zero
// would turn the row into NaN/inf.
//
// Indexing: the flat index is col + row * ncluster, so only threads whose
// global row is 0 can fall inside [0, ncluster).
__global__ void centersDivide(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<int> count, int ncluster, int dim) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // 64-bit arithmetic so row * ncluster cannot wrap around into range.
    const long long flat = (long long)col + (long long)row * ncluster;
    if (flat >= ncluster)
        return;
    const int i = (int)flat;

    const int members = count(i, 0);
    if (members <= 0)
        return;  // empty cluster: skip the division instead of producing NaN

    const float divisor = (float)members;
    for (int d = 0; d < dim; d++) {
        centers(i, d) /= divisor;
    }
}
// Sets every element of a float matrix to 0.0f.
// One thread per element: the y launch dimension selects the row and the
// x launch dimension selects the column. Threads outside out.rows x out.cols
// write nothing.
__global__ void zeroFloat(cv::cuda::PtrStepSz<float> out) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inside = (row < out.rows) && (col < out.cols);
    if (inside) {
        out(row, col) = 0.0f;
    }
}
// Sets every element of an int matrix to 0.
// One thread per element: the y launch dimension selects the row and the
// x launch dimension selects the column. Threads outside out.rows x out.cols
// write nothing.
__global__ void zeroInt(cv::cuda::PtrStepSz<int> out) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inside = (row < out.rows) && (col < out.cols);
    if (inside) {
        out(row, col) = 0;
    }
}
// Paints the output image with the center value of each pixel's cluster.
// One thread per pixel: the y launch dimension selects the image row and the
// x launch dimension selects the pixel column. The pixel's cluster is read
// from indices(row * out.cols + col, 0), and the dim columns of that center
// row are cast to uchar and stored in the pixel's dim consecutive bytes
// (starting at byte column `col * dim`).
// Threads mapped outside out.rows x out.cols write nothing.
__global__ void generateImage(cv::cuda::PtrStepSz<uchar> out, cv::cuda::PtrStepSz<int> indices, cv::cuda::PtrStepSz<float> centers, int dim) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < out.rows && col < out.cols) {
        const int sampleRow = row * out.cols + col;
        const int cluster = indices(sampleRow, 0);
        const int firstChannel = col * dim;
        for (int c = 0; c < dim; ++c) {
            out(row, firstChannel + c) = static_cast<uchar>(centers(cluster, c));
        }
    }
}
After programming this kernel, we noticed an improvement in performance.