Changes

← Older edit

TeamDS

3,766 bytes added, 00:25, 12 April 2017

→‎Launch Config GPU Optimized Phase 2

</syntaxhighlight>

=== GPU Optimization Phase 2 ===

For each of the n pixels, the kernel recomputes the XY coordinates of all n pixels, for a total of n^2 coordinate calculations. Since the XY coordinate of each pixel is fixed, we can pre-generate XCoord and YCoord arrays once and look them up with a single array index. However, this will increase the GPU's accesses to global memory, so we will need to benchmark to see whether it actually gives better times.

=== GenerateXYCoord Kernel ===

<syntaxhighlight lang="cpp">

// Precomputes the 2D coordinate of every pixel of a width x height image.
//
// Each thread handles one flat pixel index (1D grid, 1D blocks) and writes
// that pixel's column to xCoord and its row to yCoord. Threads whose index
// falls at or beyond width * height write nothing.
//
//   xCoord, yCoord : output arrays, each indexed by flat pixel index
//   width, height  : image dimensions in pixels
__global__ void GenerateXYCoord(int xCoord[], int yCoord[], int width, int height)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int pixelCount = width * height;

    if (idx < pixelCount)
    {
        // Flat index -> (column, row) for a row-by-row layout.
        const int row = idx / width;
        const int col = idx % width;

        yCoord[idx] = row;
        xCoord[idx] = col;
    }
}

</syntaxhighlight >

=== SDFGenerateCuda Kernel Optimized Phase 2 ===

<syntaxhighlight lang="cpp">

// Brute-force distance field generation, one thread per pixel.
//
// Expects a 1D grid of 1D blocks with at least `size` threads in total;
// surplus threads exit immediately.
//
//   src     : input pixels; a value > 0 is an "inside" pixel, a value == 0
//             is an "outside" pixel
//   dst     : output pixels, one per input pixel
//   xCoord,
//   yCoord  : per-pixel coordinates indexed by flat pixel index
//   size    : number of pixels in src / dst / xCoord / yCoord
//   spread  : distance at which the output saturates
//
// An inside pixel searches for the closest pixel whose value is exactly 0,
// an outside pixel for the closest pixel whose value is > 0. The distance is
// divided by `spread`, clamped to 1, and written as 0.5 + 0.5 * d for inside
// pixels and 0.5 * (1 - d) for outside pixels.
__global__ void SDFGenerateCuda(const float src[], float dst[], const int xCoord[], const int yCoord[], int size, int spread)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= size)
        return;

    // This pixel's classification and position do not change during the
    // search, so read them once instead of on every loop iteration.
    const bool inside = src[i] > 0;
    const int xi = xCoord[i];
    const int yi = yCoord[i];

    // Compare squared distances while searching; only the final minimum
    // needs a sqrtf.
    float shortestDistSquared = MAX_FLOAT_VALUE;

    for (int j = 0; j < size; j++)
    {
        const float other = src[j];

        // Inside pixels look for outside pixels (== 0) and vice versa (> 0).
        const bool isTarget = inside ? (other == 0) : (other > 0);
        if (!isTarget)
            continue;

        // Calculate squared distance
        float dx = xCoord[j] - xi;
        float dy = yCoord[j] - yi;
        float distSquared = dx * dx + dy * dy;
        if (distSquared < shortestDistSquared)
            shortestDistSquared = distSquared;
    }

    float shortestDist = sqrtf(shortestDistSquared);
    float spread01 = (shortestDist / spread);
    if (spread01 > 1)
        spread01 = 1; // clamp it

    dst[i] = inside ? (spread01 * .5f) + 0.5f
                    : (1 - spread01) * .5f;
}

</syntaxhighlight >

=== Launch Config GPU Optimized Phase 2 ===

<syntaxhighlight lang="cpp">
// Loads the bitmap named by the single command-line argument, generates its
// distance field on the GPU, and saves the result back to the same path.
// Returns 0 on success, 1 on bad arguments, load failure, or CUDA failure.
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        cout << "Incorrect number of arg";
        return 1;
    }

    char* path = argv[1];
    BinaryBitmap* bitmap = LoadBitmap(path);
    if (bitmap == NULL)
        return 1;

    int d;
    cudaDeviceProp prop;
    cudaGetDevice(&d);
    cudaGetDeviceProperties(&prop, d);
    unsigned ntpb = prop.maxThreadsDim[0];

    int size = bitmap->GetSize();
    int width = bitmap->_width;
    int height = bitmap->_height;

    // Allocate GPU memory
    float* d_src;
    float* d_dst;
    int* d_xCoord;
    int* d_yCoord;
    cudaMalloc((void**)&d_src, size * sizeof(float));
    cudaMalloc((void**)&d_dst, size * sizeof(float));
    cudaMalloc((void**)&d_xCoord, size * sizeof(int));
    cudaMalloc((void**)&d_yCoord, size * sizeof(int));

    // Copy src to device src
    cudaMemcpy(d_src, bitmap->_pixels, size * sizeof(float), cudaMemcpyHostToDevice);

    // Prepare kernel launch
    // Calc how many blocks to launch
    int numOfBlocks = ((size + ntpb - 1) / ntpb);

    // Launch grid for pre-calculating XYCoords
    GenerateXYCoord<<<numOfBlocks, ntpb>>>(d_xCoord, d_yCoord, width, height);

    // Launch grid for converting
    SDFGenerateCuda<<<numOfBlocks, ntpb>>>(d_src, d_dst, d_xCoord, d_yCoord, size, 64);

    // Launch-configuration errors are only visible through cudaGetLastError;
    // errors during execution surface at the synchronize.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize(); // Wait for kernel to finish before copying

    // Buffer array for SDF pixels
    float* dst = new float[size];
    int exitCode = 0;

    if (status == cudaSuccess)
    {
        cudaMemcpy(dst, d_dst, size * sizeof(float), cudaMemcpyDeviceToHost);
        SaveBitmap(path, dst);
    }
    else
    {
        // Do not overwrite the bitmap with undefined data.
        cout << "CUDA error: " << cudaGetErrorString(status);
        exitCode = 1;
    }

    Pause();

    // Free memory back
    cudaFree(d_src);
    cudaFree(d_dst);
    cudaFree(d_xCoord);
    cudaFree(d_yCoord);
    delete bitmap;
    delete[] dst; // allocated with new[], so it must be released with delete[]

    return exitCode;
}
</syntaxhighlight >

Dshirzad

116

edits

Changes

TeamDS

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools