Open main menu

CDOT Wiki β

Changes

TeamDS

10,496 bytes added, 00:25, 12 April 2017
Launch Config GPU Optimized Phase 2
===Bench Marks===
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls  Ts/call  Ts/call  name
100.00      6.90     6.90                             Generate(float const*, float*, int, int, int)
  0.00      6.90     0.00        1     0.00     0.00  _GLOBAL__sub_I__Z5Pausev

===Converting a 128x128 image===
[[File:a1_128.png]]
----

===Converting a 256x256 image===
[[File:a1_256.png]]
----

===Converting a 512x512 image===
[[File:a1_512.png]]
----

=== Analysis ===
As you can see, it gets much more expensive as the image size increases. This is due to the n^2 algorithm. In order to be able to convert images to SDF for a game engine in a reasonable amount of time, we need to look into GPU parallelism.
=== Assignment 2 ===
 
===Converting the code to Cuda===
 
For now, this code is a direct conversion to CUDA without any special consideration for GPU-related optimizations.
 
=== Launch Config ===
<syntaxhighlight lang="cpp">
// Entry point: loads the bitmap named by the single command-line argument,
// runs SDFGenerateCuda over it on the current CUDA device with a spread of 64,
// and writes the result back through SaveBitmap.
//
// Returns 0 on success and 1 on any failure (wrong argument count, bitmap
// load failure, empty bitmap, or a CUDA error).
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        cout << "Incorrect number of arg";
        return 1;
    }

    char* path = argv[1];
    BinaryBitmap* bitmap = LoadBitmap(path);
    if (bitmap == NULL)
        return 1;

    const int size = bitmap->GetSize();
    const int width = bitmap->_width;
    const int height = bitmap->_height;
    if (size <= 0)
    {
        // Nothing to convert, and a launch with zero blocks is not a valid configuration.
        delete bitmap;
        return 1;
    }
    const size_t bytes = size * sizeof(float);

    // Everything that is released at the end starts out null so the cleanup
    // below is safe no matter which step failed.
    float* d_src = NULL;
    float* d_dst = NULL;
    float* dst = NULL;

    // Query the current device. Each later step runs only if every earlier
    // CUDA call succeeded.
    int d = 0;
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDevice(&d);
    if (err == cudaSuccess)
        err = cudaGetDeviceProperties(&prop, d);

    // Allocate GPU memory
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_src, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_dst, bytes);

    // Copy src to device src
    if (err == cudaSuccess)
        err = cudaMemcpy(d_src, bitmap->_pixels, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // maxThreadsPerBlock is the limit on the total threads in a block;
        // maxThreadsDim[0] only bounds the x extent of a block.
        const unsigned ntpb = prop.maxThreadsPerBlock;
        // Round up so that every pixel gets a thread.
        const int numOfBlocks = (size + ntpb - 1) / ntpb;

        SDFGenerateCuda<<<numOfBlocks, ntpb>>>(d_src, d_dst, width, height, 64);
        // A launch does not return a status; configuration errors are read here.
        err = cudaGetLastError();
    }

    // Wait for the kernel to finish before copying. Failures during execution
    // (including a kernel that was aborted) are reported by this call.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    if (err == cudaSuccess)
    {
        // buffer array for SDF pixels
        dst = new float[size];
        err = cudaMemcpy(dst, d_dst, bytes, cudaMemcpyDeviceToHost);
    }

    // Only save when the whole pipeline succeeded, so a failed run never
    // writes an unfilled buffer over the image.
    if (err == cudaSuccess)
        SaveBitmap(path, dst);
    else
        cout << "CUDA error: " << cudaGetErrorString(err) << "\n";

    Pause();

    // Free memory back
    cudaFree(d_src);
    cudaFree(d_dst);
    delete bitmap;
    delete[] dst; // allocated with new[]

    return (err == cudaSuccess) ? 0 : 1;
}
</syntaxhighlight>
 
=== The Kernel ===
<syntaxhighlight lang="cpp">
// Direct (unoptimized) signed-distance-field kernel.
//
// Launch layout: 1D grid of 1D blocks, one thread per pixel; threads whose
// flat index is >= width * height do nothing.
//
// src    - input pixels, row-major; a value > 0 is treated as "inside",
//          a value == 0 as "outside".
// dst    - output pixels, same indexing as src.
// spread - distance (in pixels) at which the output saturates.
//
// Each thread scans every pixel for the nearest one of the opposite kind.
// Inside pixels are written as 0.5 + 0.5 * t and outside pixels as
// (1 - t) * 0.5, where t is the nearest distance divided by spread,
// clamped to 1. MAX_FLOAT_VALUE is expected to be defined elsewhere.
__global__ void SDFGenerateCuda(const float src[], float dst[], int width, int height, int spread)
{
    const int pixelCount = width * height;
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    if (self < pixelCount)
    {
        // Recover this pixel's 2D position from its flat index.
        const int selfRow = self / width;
        const float selfX = self - (selfRow * width);
        const float selfY = selfRow;

        const bool isInside = src[self] > 0;
        float nearest = MAX_FLOAT_VALUE;

        for (int other = 0; other < pixelCount; ++other)
        {
            const float otherVal = src[other];
            // Inside pixels look for outside ones (== 0) and vice versa (> 0).
            const bool isOpposite = isInside ? (otherVal == 0) : (otherVal > 0);
            if (!isOpposite)
                continue;

            const int otherRow = other / width;
            const float otherX = other - (otherRow * width);
            const float otherY = otherRow;

            const float stepX = otherX - selfX;
            const float stepY = otherY - selfY;
            const float dist = sqrtf(stepX * stepX + stepY * stepY);
            nearest = (dist < nearest) ? dist : nearest;
        }

        // Normalize by the spread and clamp to 1.
        const float ratio = nearest / spread;
        const float spread01 = (ratio > 1) ? 1 : ratio;

        if (isInside)
            dst[self] = (spread01 * .5f) + 0.5f;
        else
            dst[self] = (1 - spread01) * .5f;
    }
}
 
</syntaxhighlight>
 
 
=== GPU Bench Marks ===
 
=== Converting 128x128 Image on GPU ===
 
Converting took less than one second. Serial CPU took 6.5 seconds.
 
=== Converting 256x256 Image on GPU ===
 
Converting took 1 second. Serial CPU took 110 seconds.
 
=== Converting 512x512 Image on GPU ===
 
Converting took less than 8 seconds. Serial CPU took about 30 minutes. '''However, the GPU driver reports that the GPU has stopped responding. I think the driver stops the program, so this timing may not reflect a completed conversion.'''
 
 
=== Converting 1024x1024 Image on GPU ===
 
Converting took less than 8 seconds. The serial CPU version would probably take hours. '''However, the GPU driver reports that the GPU has stopped responding. I think the driver stops the program, so this timing may not reflect a completed conversion.'''
 
=== Assignment 3 ===
 
=== GPU Optimization Phase 1 ===
 
The first optimization step is to call sqrtf less often. It turns out that we call sqrtf many times just to compare distances and see which one is larger. However, squaring two non-negative numbers does not change which one is larger, so we can compare the squared distances instead of the actual distances. This way we only need to call sqrtf once per pixel! Below is the optimized kernel:
 
<syntaxhighlight lang="cpp">
 
// Signed-distance-field kernel, Phase 1: candidates are compared by squared
// distance, and sqrtf is applied once to the winning value.
//
// Launch layout: 1D grid of 1D blocks, one thread per pixel; threads whose
// flat index is >= width * height do nothing.
//
// src    - input pixels, row-major; a value > 0 is treated as "inside",
//          a value == 0 as "outside".
// dst    - output pixels, same indexing as src.
// spread - distance (in pixels) at which the output saturates.
//
// Inside pixels are written as 0.5 + 0.5 * t and outside pixels as
// (1 - t) * 0.5, where t is the nearest distance divided by spread,
// clamped to 1. MAX_FLOAT_VALUE is expected to be defined elsewhere.
__global__ void SDFGenerateCuda(const float src[], float dst[], int width, int height, int spread)
{
    const int pixelCount = width * height;
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    if (self < pixelCount)
    {
        // Recover this pixel's 2D position from its flat index.
        const int selfRow = self / width;
        const float selfX = self - (selfRow * width);
        const float selfY = selfRow;

        const bool isInside = src[self] > 0;

        // Smallest squared distance seen so far; the square root is taken
        // only once, after the scan.
        float nearestSq = MAX_FLOAT_VALUE;

        for (int other = 0; other < pixelCount; ++other)
        {
            const float otherVal = src[other];
            // Inside pixels look for outside ones (== 0) and vice versa (> 0).
            const bool isOpposite = isInside ? (otherVal == 0) : (otherVal > 0);
            if (!isOpposite)
                continue;

            const int otherRow = other / width;
            const float otherX = other - (otherRow * width);
            const float otherY = otherRow;

            const float stepX = otherX - selfX;
            const float stepY = otherY - selfY;
            const float candidateSq = stepX * stepX + stepY * stepY;
            nearestSq = (candidateSq < nearestSq) ? candidateSq : nearestSq;
        }

        // Normalize by the spread and clamp to 1.
        const float ratio = sqrtf(nearestSq) / spread;
        const float spread01 = (ratio > 1) ? 1 : ratio;

        if (isInside)
            dst[self] = (spread01 * .5f) + 0.5f;
        else
            dst[self] = (1 - spread01) * .5f;
    }
}
 
 
</syntaxhighlight>
 
=== GPU Optimization Phase 2 ===
For each of the n pixels, we recalculate the XY coordinates of all n pixels, for a total of n^2 calculations. Since the XY coordinate of each pixel is fixed, we can pre-generate XYCoord arrays that are indexed by the same flat pixel index. However, this will increase the number of global memory accesses on the GPU, so we need to benchmark to see whether it actually gives better times.
 
=== GenerateXYCoord Kernel ===
 
<syntaxhighlight lang="cpp">
// Fills two lookup tables that map a flat, row-major pixel index to its
// column (xCoord) and row (yCoord).
//
// Launch layout: 1D grid of 1D blocks, one thread per pixel; threads whose
// flat index is >= width * height write nothing.
__global__ void GenerateXYCoord(int xCoord[], int yCoord[], int width, int height)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < width * height)
    {
        const int row = idx / width;
        yCoord[idx] = row;
        // Column is what remains after removing the full rows.
        xCoord[idx] = idx - (row * width);
    }
}
 
</syntaxhighlight >
 
=== SDFGenerateCuda Kernel Optimized Phase 2 ===
 
<syntaxhighlight lang="cpp">
 
// Signed-distance-field kernel, Phase 2: pixel coordinates are read from
// precomputed tables instead of being derived from the flat index.
//
// Launch layout: 1D grid of 1D blocks, one thread per pixel; threads whose
// flat index is >= size do nothing.
//
// src            - input pixels; a value > 0 is treated as "inside",
//                  a value == 0 as "outside".
// dst            - output pixels, same indexing as src.
// xCoord, yCoord - per-pixel coordinates, indexed like src.
// size           - number of pixels.
// spread         - distance (in pixels) at which the output saturates.
//
// Inside pixels are written as 0.5 + 0.5 * t and outside pixels as
// (1 - t) * 0.5, where t is the nearest distance divided by spread,
// clamped to 1. MAX_FLOAT_VALUE is expected to be defined elsewhere.
__global__ void SDFGenerateCuda(const float src[], float dst[], const int xCoord[], const int yCoord[], int size, int spread)
{
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    if (self < size)
    {
        // This thread's own coordinates do not change during the scan.
        const int selfX = xCoord[self];
        const int selfY = yCoord[self];

        const bool isInside = src[self] > 0;

        // Smallest squared distance seen so far; the square root is taken
        // only once, after the scan.
        float nearestSq = MAX_FLOAT_VALUE;

        for (int other = 0; other < size; ++other)
        {
            const float otherVal = src[other];
            // Inside pixels look for outside ones (== 0) and vice versa (> 0).
            const bool isOpposite = isInside ? (otherVal == 0) : (otherVal > 0);
            if (!isOpposite)
                continue;

            // Integer subtraction first, then conversion to float.
            const float stepX = xCoord[other] - selfX;
            const float stepY = yCoord[other] - selfY;
            const float candidateSq = stepX * stepX + stepY * stepY;
            nearestSq = (candidateSq < nearestSq) ? candidateSq : nearestSq;
        }

        // Normalize by the spread and clamp to 1.
        const float ratio = sqrtf(nearestSq) / spread;
        const float spread01 = (ratio > 1) ? 1 : ratio;

        if (isInside)
            dst[self] = (spread01 * .5f) + 0.5f;
        else
            dst[self] = (1 - spread01) * .5f;
    }
}
 
</syntaxhighlight >
 
 
=== Launch Config GPU Optimized Phase 2 ===
 
<syntaxhighlight lang="cpp">
// Entry point for the Phase 2 pipeline: loads the bitmap named by the single
// command-line argument, precomputes per-pixel coordinates with
// GenerateXYCoord, runs SDFGenerateCuda with a spread of 64, and writes the
// result back through SaveBitmap.
//
// Returns 0 on success and 1 on any failure (wrong argument count, bitmap
// load failure, empty bitmap, or a CUDA error).
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        cout << "Incorrect number of arg";
        return 1;
    }

    char* path = argv[1];
    BinaryBitmap* bitmap = LoadBitmap(path);
    if (bitmap == NULL)
        return 1;

    const int size = bitmap->GetSize();
    const int width = bitmap->_width;
    const int height = bitmap->_height;
    if (size <= 0)
    {
        // Nothing to convert, and a launch with zero blocks is not a valid configuration.
        delete bitmap;
        return 1;
    }
    const size_t pixelBytes = size * sizeof(float);
    const size_t coordBytes = size * sizeof(int);

    // Everything that is released at the end starts out null so the cleanup
    // below is safe no matter which step failed.
    float* d_src = NULL;
    float* d_dst = NULL;
    int* d_xCoord = NULL;
    int* d_yCoord = NULL;
    float* dst = NULL;

    // Query the current device. Each later step runs only if every earlier
    // CUDA call succeeded.
    int d = 0;
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDevice(&d);
    if (err == cudaSuccess)
        err = cudaGetDeviceProperties(&prop, d);

    // Allocate GPU memory
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_src, pixelBytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_dst, pixelBytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_xCoord, coordBytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_yCoord, coordBytes);

    // Copy src to device src
    if (err == cudaSuccess)
        err = cudaMemcpy(d_src, bitmap->_pixels, pixelBytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        // maxThreadsPerBlock is the limit on the total threads in a block;
        // maxThreadsDim[0] only bounds the x extent of a block.
        const unsigned ntpb = prop.maxThreadsPerBlock;
        // Round up so that every pixel gets a thread.
        const int numOfBlocks = (size + ntpb - 1) / ntpb;

        // Launch grid for pre-calculating XYCoords
        GenerateXYCoord<<<numOfBlocks, ntpb>>>(d_xCoord, d_yCoord, width, height);
        // A launch does not return a status; configuration errors are read here.
        err = cudaGetLastError();

        if (err == cudaSuccess)
        {
            // Launch grid for converting
            SDFGenerateCuda<<<numOfBlocks, ntpb>>>(d_src, d_dst, d_xCoord, d_yCoord, size, 64);
            err = cudaGetLastError();
        }
    }

    // Wait for the kernels to finish before copying. Failures during
    // execution (including a kernel that was aborted) are reported by this call.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    if (err == cudaSuccess)
    {
        // buffer array for SDF pixels
        dst = new float[size];
        err = cudaMemcpy(dst, d_dst, pixelBytes, cudaMemcpyDeviceToHost);
    }

    // Only save when the whole pipeline succeeded, so a failed run never
    // writes an unfilled buffer over the image.
    if (err == cudaSuccess)
        SaveBitmap(path, dst);
    else
        cout << "CUDA error: " << cudaGetErrorString(err) << "\n";

    Pause();

    // Free memory back
    cudaFree(d_src);
    cudaFree(d_dst);
    cudaFree(d_xCoord);
    cudaFree(d_yCoord);
    delete bitmap;
    delete[] dst; // allocated with new[]

    return (err == cudaSuccess) ? 0 : 1;
}
</syntaxhighlight >
116
edits