By implementing this kernel, the processing time of the blur effect improved significantly compared to its serial counterpart, as illustrated in the graph below.
<br><br>
=== Kernel Code ===
<syntaxhighlight lang="cpp">
__global__
output_image[offset * 3 + 2] = static_cast<unsigned char>(output_blue / hits);
}
}
</syntaxhighlight>
=== Launching the Kernel ===
<syntaxhighlight lang="cpp">
// Host wrapper that runs the `blur` kernel over a 3-channel, 8-bit image.
//
// input     - source image; its pixel data is uploaded to the device.
// output    - destination image; receives the kernel's result on success.
// width     - image width in pixels, used to size the device buffers.
// height    - image height in pixels, used to size the device buffers.
// neighbour - forwarded unchanged to the kernel.
//
// NOTE(review): the buffers are sized from width/height while the kernel is
// given input.cols/input.rows; the two are presumably identical - confirm at
// the call site.
// NOTE(review): the copies treat the pixel data as one contiguous block of
// width*height*3 bytes, i.e. they assume both Mats are continuous (no row
// padding) - confirm, or copy row by row using Mat::step.
//
// If any CUDA call fails the function releases the device buffers and
// returns without touching `output`.
void filter(const Mat& input, Mat& output, int width, int height, int neighbour)
{
	// An empty image has nothing to blur, and a zero-sized grid is an invalid launch.
	if (width <= 0 || height <= 0)
		return;

	// Total number of bytes in one image; size_t avoids int overflow on large images.
	const size_t imageBytes = static_cast<size_t>(width) * height * 3 * sizeof(unsigned char);

	unsigned char *d_input = nullptr, *d_output = nullptr;

	// Allocate device memory for the source and the result
	if (cudaMalloc((void**)&d_input, imageBytes) != cudaSuccess ||
		cudaMalloc((void**)&d_output, imageBytes) != cudaSuccess)
	{
		// cudaFree on a null pointer is a no-op, so this is safe for a partial failure.
		cudaFree(d_input);
		cudaFree(d_output);
		return;
	}

	// Copy data from the OpenCV input image to device memory
	if (cudaMemcpy(d_input, input.ptr(), imageBytes, cudaMemcpyHostToDevice) == cudaSuccess)
	{
		const dim3 blockDims(512, 1, 1);

		// Calculate the grid size to cover the whole image. Integer ceil-division
		// rounds up, so a partially filled last block is still launched and
		// images smaller than one block get a grid of 1 instead of 0.
		const dim3 gridDims((unsigned int)((imageBytes + blockDims.x - 1) / blockDims.x), 1, 1);

		// Launch the blur kernel
		blur<<<gridDims, blockDims>>>(d_input, d_output, input.cols, input.rows, neighbour);

		// cudaGetLastError reports launch-configuration errors; the synchronize
		// reports errors raised while the kernel was executing.
		if (cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess)
		{
			// Copy the result back from device memory to the OpenCV output image
			cudaMemcpy(output.ptr(), d_output, imageBytes, cudaMemcpyDeviceToHost);
		}
	}

	// Free the device memory
	cudaFree(d_input);
	cudaFree(d_output);
}
</syntaxhighlight>