116
edits
Changes
TeamDS
,→SDFGenerateCuda Kernel Optimized Phase 2
}
</syntaxhighlight>
=== Launch Config GPU Optimized Phase 2 ===
<syntaxhighlight lang="cpp">
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool CudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Program entry point.
//
// Expects exactly one command-line argument: the path handed to LoadBitmap().
// The bitmap's pixels are copied to the GPU, GenerateXYCoord and
// SDFGenerateCuda are launched over them, and the result buffer is copied
// back and passed to SaveBitmap() together with the same path.
//
// Returns 0 on success and 1 when the argument count is wrong, the bitmap
// cannot be loaded or is empty, or any CUDA call / kernel launch fails.
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        cout << "Incorrect number of arg";
        return 1;
    }
    char* path = argv[1];
    BinaryBitmap* bitmap = LoadBitmap(path);
    if (bitmap == NULL)
        return 1;

    int size = bitmap->GetSize();
    int width = bitmap->_width;
    int height = bitmap->_height;
    // A zero-block grid is an invalid launch configuration, so bail out early.
    if (size <= 0)
    {
        std::cerr << "Bitmap has no pixels to process" << std::endl;
        delete bitmap;
        return 1;
    }
    const size_t pixelBytes = size * sizeof(float);
    const size_t coordBytes = size * sizeof(int);

    // Every buffer starts out NULL so the single cleanup path at the bottom
    // is safe no matter where a failure occurs (cudaFree(NULL) and
    // delete[] NULL are both no-ops).
    float* d_src = NULL;
    float* d_dst = NULL;
    int* d_xCoord = NULL;
    int* d_yCoord = NULL;
    float* dst = NULL;

    // Query the current device for its launch limits
    int d = 0;
    cudaDeviceProp prop;
    bool ok = CudaSucceeded(cudaGetDevice(&d), "cudaGetDevice")
        && CudaSucceeded(cudaGetDeviceProperties(&prop, d), "cudaGetDeviceProperties");

    // Allocate GPU memory
    ok = ok && CudaSucceeded(cudaMalloc((void**)&d_src, pixelBytes), "cudaMalloc(d_src)");
    ok = ok && CudaSucceeded(cudaMalloc((void**)&d_dst, pixelBytes), "cudaMalloc(d_dst)");
    ok = ok && CudaSucceeded(cudaMalloc((void**)&d_xCoord, coordBytes), "cudaMalloc(d_xCoord)");
    ok = ok && CudaSucceeded(cudaMalloc((void**)&d_yCoord, coordBytes), "cudaMalloc(d_yCoord)");

    // Copy src to device src
    // NOTE(review): assumes bitmap->_pixels holds `size` floats - confirm against BinaryBitmap.
    ok = ok && CudaSucceeded(
        cudaMemcpy(d_src, bitmap->_pixels, pixelBytes, cudaMemcpyHostToDevice),
        "cudaMemcpy(host to device)");

    if (ok)
    {
        // Prepare kernel launch.
        // Both launches are 1D, so the block size must respect the x-dimension
        // limit and the overall threads-per-block limit.
        unsigned ntpb = prop.maxThreadsDim[0];
        if (ntpb > (unsigned)prop.maxThreadsPerBlock)
            ntpb = (unsigned)prop.maxThreadsPerBlock;

        // Calc how many blocks to launch (ceiling division: one thread per pixel)
        int numOfBlocks = ((size + ntpb - 1) / ntpb);

        // Launch grid for pre-calculating XYCoords
        GenerateXYCoord<<<numOfBlocks, ntpb>>>(d_xCoord, d_yCoord, width, height);
        // Launches return no status; configuration errors surface here.
        ok = CudaSucceeded(cudaGetLastError(), "GenerateXYCoord launch");

        if (ok)
        {
            // Launch grid for converting
            // NOTE(review): the meaning of the trailing 64 is defined by the kernel, not visible here.
            SDFGenerateCuda<<<numOfBlocks, ntpb>>>(d_src, d_dst, d_xCoord, d_yCoord, size, 64);
            ok = CudaSucceeded(cudaGetLastError(), "SDFGenerateCuda launch");
        }

        // Wait for kernels to finish before copying; execution faults surface here
        ok = ok && CudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok)
    {
        // buffer array for SDF pixels
        dst = new float[size];
        ok = CudaSucceeded(
            cudaMemcpy(dst, d_dst, pixelBytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy(device to host)");
    }

    // Only write output when the whole pipeline succeeded
    if (ok)
        SaveBitmap(path, dst);
    Pause();

    // Free memory back
    cudaFree(d_src);
    cudaFree(d_dst);
    cudaFree(d_xCoord);
    cudaFree(d_yCoord);
    delete bitmap;
    delete[] dst; // allocated with new[], so it must be released with delete[]
    return ok ? 0 : 1;
}
</syntaxhighlight>