Changes

Jump to: navigation, search

TeamDS

3,766 bytes added, 00:25, 12 April 2017
Launch Config GPU Optimized Phase 2
</syntaxhighlight>
=== GPU Optimization Phase 2 ===
For each of the n pixels, the kernel recomputes XY coordinates n times, for a total of n^2 coordinate calculations. Since the XY coordinates of every pixel are fixed, we can pre-generate XYCoord arrays that are looked up with a single array index. However, this will increase the GPU's accesses to global memory, so we will need to benchmark it to see whether it actually gives better times.
 
=== GenerateXYCoord Kernel ===
 
<syntaxhighlight lang="cpp">
// Fills xCoord/yCoord so that entry i holds the column and row of the pixel
// stored at flat, row-major index i of a width x height image.
// Launch layout: 1D grid of 1D blocks (only the .x components are used);
// threads whose flat index falls past width * height do nothing.
__global__ void GenerateXYCoord(int xCoord[], int yCoord[], int width, int height)
{
    const int pixelCount = width * height;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < pixelCount)
    {
        // Row is the quotient and column the remainder of the flat index by the row length.
        xCoord[idx] = idx % width;
        yCoord[idx] = idx / width;
    }
}
 
</syntaxhighlight >
 
=== SDFGenerateCuda Kernel Optimized Phase 2 ===
 
<syntaxhighlight lang="cpp">
 
// Brute-force signed-distance-field kernel: each thread handles one pixel i and
// scans all `size` pixels for the nearest pixel of the opposite kind.
//
//   src     - per-pixel input values; a value > 0 marks an inside pixel
//   dst     - per-pixel output, written at index i
//   xCoord  - x coordinate of every pixel, indexed like src
//   yCoord  - y coordinate of every pixel, indexed like src
//   size    - number of pixels in src/dst/xCoord/yCoord
//   spread  - distance (in pixels) at which the result saturates
//
// Output: inside pixels map to [0.5, 1], all other pixels to [0, 0.5].
// Launch layout: 1D grid of 1D blocks (only the .x components are used);
// threads with i >= size exit immediately.
// NOTE(review): MAX_FLOAT_VALUE is defined elsewhere; presumably a very large
// float so that "no opposite pixel found" clamps to the maximum spread - confirm.
__global__ void SDFGenerateCuda(const float src[], float dst[], const int xCoord[], const int yCoord[], int size, int spread)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= size)
        return;

    // This pixel's own coordinates do not change during the search, so read them once.
    const int px = xCoord[i];
    const int py = yCoord[i];

    // Inside pixels look for the closest pixel whose value is exactly 0 (outside);
    // every other pixel looks for the closest inside pixel (value > 0).
    const bool inside = src[i] > 0;

    // Used for avoiding unnecessary sqrt calc.
    // Just compare the squared distances and
    // only take the sqrt of the shortest one.
    float shortestDistSquared = MAX_FLOAT_VALUE;

    for (int j = 0; j < size; j++)
    {
        const float other = src[j];
        const bool isTarget = inside ? (other == 0) : (other > 0);
        if (isTarget)
        {
            // Calculate distance
            float dx = xCoord[j] - px;
            float dy = yCoord[j] - py;
            float distSquared = dx * dx + dy * dy;
            if (distSquared < shortestDistSquared)
                shortestDistSquared = distSquared;
        }
    }

    float shortestDist = sqrtf(shortestDistSquared);
    float spread01 = (shortestDist / spread);
    if (spread01 > 1)
        spread01 = 1; // clamp it

    // Inside pixels grow from 0.5 up to 1; the rest fall from 0.5 down to 0.
    dst[i] = inside ? (spread01 * .5f) + 0.5f : (1 - spread01) * .5f;
}
</syntaxhighlight >

=== Launch Config GPU Optimized Phase 2 ===

<syntaxhighlight lang="cpp">
// Loads the bitmap named by the single command-line argument, generates its
// signed distance field on the GPU, and saves the result through SaveBitmap.
// Returns 0 on success and 1 on a usage error, load failure, empty bitmap,
// or any CUDA error.
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        cout << "Incorrect number of arg";
        return 1;
    }

    char* path = argv[1];
    BinaryBitmap* bitmap = LoadBitmap(path);
    if (bitmap == NULL)
        return 1;

    int size = bitmap->GetSize();
    int width = bitmap->_width;
    int height = bitmap->_height;

    // An empty image would produce a zero-block grid, which is an invalid launch.
    if (size <= 0)
    {
        cout << "Bitmap has no pixels";
        delete bitmap;
        return 1;
    }

    // Everything starts out NULL so the single cleanup path below is safe
    // no matter which step failed.
    float* d_src = NULL;
    float* d_dst = NULL;
    int* d_xCoord = NULL;
    int* d_yCoord = NULL;
    float* dst = NULL;

    int d = 0;
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDevice(&d);
    if (err == cudaSuccess)
        err = cudaGetDeviceProperties(&prop, d);

    // Allocate GPU memory
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_src, size * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_dst, size * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_xCoord, size * sizeof(int));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_yCoord, size * sizeof(int));

    // Copy src to device src
    // NOTE(review): assumes bitmap->_pixels holds `size` floats - confirm against BinaryBitmap.
    if (err == cudaSuccess)
        err = cudaMemcpy(d_src, bitmap->_pixels, size * sizeof(float), cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // Prepare kernel launch.
        // A 1D block is bounded by both the x-dimension limit and the per-block thread limit.
        unsigned ntpb = prop.maxThreadsDim[0];
        if (ntpb > (unsigned)prop.maxThreadsPerBlock)
            ntpb = (unsigned)prop.maxThreadsPerBlock;

        // Calc how many blocks to launch
        int numOfBlocks = ((size + ntpb - 1) / ntpb);

        // Launch grid for pre-calculating XYCoords
        GenerateXYCoord<<<numOfBlocks, ntpb>>>(d_xCoord, d_yCoord, width, height);
        err = cudaGetLastError(); // launches report bad configurations only through this call

        if (err == cudaSuccess)
        {
            // Launch grid for converting
            SDFGenerateCuda<<<numOfBlocks, ntpb>>>(d_src, d_dst, d_xCoord, d_yCoord, size, 64);
            err = cudaGetLastError();
        }
    }

    // Wait for kernel to finish before copying; this also surfaces in-kernel faults.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    if (err == cudaSuccess)
    {
        // buffer for SDF pixels
        dst = new float[size];
        err = cudaMemcpy(dst, d_dst, size * sizeof(float), cudaMemcpyDeviceToHost);
    }

    int exitCode = 0;
    if (err == cudaSuccess)
    {
        SaveBitmap(path, dst);
    }
    else
    {
        cout << "CUDA error: " << cudaGetErrorString(err);
        exitCode = 1;
    }

    Pause();

    // Free memory back
    cudaFree(d_src);
    cudaFree(d_dst);
    cudaFree(d_xCoord);
    cudaFree(d_yCoord);
    delete bitmap;
    delete[] dst; // allocated with new[], so it must be released with delete[]

    return exitCode;
}
</syntaxhighlight >
116
edits

Navigation menu