Changes

Jump to: navigation, search

Sirius

2,488 bytes added, 09:25, 9 April 2018
Kernel Code
By implementing this kernel, the process time of the blur effect had made a significant improvement, compared to it's serial counter-part as illustrated in the graph below.
<br><br>
=== Kernel Code ===
<syntaxhighlight lang="cpp">
__global__
void blur(unsigned char* input_image, unsigned char* output_image, int width, int height, int neighbour) {
 
const unsigned int offset = blockIdx.x*blockDim.x + threadIdx.x;
int x = offset % width;
int y = (offset - x) / width;
if (offset < width*height) {
 
float output_red = 0;
float output_green = 0;
float output_blue = 0;
int hits = 0;
for (int ox = -neighbour; ox < neighbour + 1; ++ox) {
for (int oy = -neighbour; oy < neighbour + 1; ++oy) {
if ((x + ox) > -1 && (x + ox) < width && (y + oy) > -1 && (y + oy) < height) {
const int currentoffset = (offset + ox + oy*width) * 3;
output_red += input_image[currentoffset];
output_green += input_image[currentoffset + 1];
output_blue += input_image[currentoffset + 2];
hits++;
}
}
}
output_image[offset * 3] = static_cast<unsigned char>(output_red / hits);
output_image[offset * 3 + 1] = static_cast<unsigned char>(output_green / hits);
output_image[offset * 3 + 2] = static_cast<unsigned char>(output_blue / hits);
}
}
</syntaxhighlight>
=== Launching the Kernel ===
<syntaxhighlight lang="cpp">
void filter(const Mat& input, Mat& output, int width, int height, int neighbour)
{
//Calculate total number of bytes of input and output image
const int colorBytes = input.step * input.rows;
const int grayBytes = output.step * output.rows;
 
unsigned char *d_input, *d_output;
 
//Allocate device memory
cudaMalloc((void**)&d_input, width*height * 3 * sizeof(unsigned char));
cudaMalloc((void**)&d_output, width*height * 3 * sizeof(unsigned char));
 
//Copy data from OpenCV input image to device memory
cudaMemcpy(d_input, input.ptr(), width*height * 3 * sizeof(unsigned char), cudaMemcpyHostToDevice);
//cudaMemcpy(d_input, input.ptr(), colorBytes, cudaMemcpyHostToDevice);
 
 
dim3 blockDims(512, 1, 1);
//Calculate grid size to cover the whole image
 
dim3 gridDims((unsigned int)ceil((double)(width*height * 3 / blockDims.x)), 1, 1);
//Launch the color conversion kernel
blur << <gridDims, blockDims >> >(d_input, d_output, input.cols, input.rows, neighbour);
 
//Synchronize to check for any kernel launch errors
cudaDeviceSynchronize();
 
//Copy back data from destination device meory to OpenCV output image
 
cudaMemcpy(output.ptr(), d_output, width*height * 3 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
//Free the device memory
cudaFree(d_input);
cudaFree(d_output);
}
</syntaxhighlight>
==== Graph ====
[[File:boxFilterFirst.png | 750px]]
81
edits

Navigation menu