Difference between revisions of "TeamDS"

From CDOT Wiki
Jump to: navigation, search
(Bench Marks)
(Launch Config GPU Optimized Phase 2)
 
(40 intermediate revisions by the same user not shown)
Line 105: Line 105:
  
 
===Bench Marks===
 
===Bench Marks===
'''Image 256x256'''
 
Flat profile:
 
  
Each sample counts as 0.01 seconds.
+
===Converting a 128x128 image===
  %  cumulative  self              self    total         
+
 
time  seconds  seconds    calls  Ts/call  Ts/call  name   
+
[[File:a1_128.png]]
100.00      6.90    6.90                            Generate(float const*, float*, int, int, int)
+
 
  0.00      6.90    0.00        1    0.00    0.00 _GLOBAL__sub_I__Z5Pausev
+
----
 +
 
 +
 
 +
===Converting a 256x256 image===
 +
 
 +
[[File:a1_256.png]]
 +
 
 +
----
 +
 
 +
 
 +
===Converting a 512x512 image===
 +
 
 +
[[File:a1_512.png]]
 +
 
 +
----
 +
 
 +
=== Analysis ===
 +
As you can see, it gets much more expensive as the image size increases. This is due to the n^2 algorithm. In order to to be able to convert images to SDF for game engine in reasonable amount of time, we need to look into GPU parallelism.   
  
 
=== Assignment 2 ===
 
=== Assignment 2 ===
 +
 +
===Converting the code to Cuda===
 +
 +
For now, this code will be a direct conversion to Cuda code without any special consideration for GPU related optimizations.
 +
 +
=== Launch Config ===
 +
<syntaxhighlight lang="cpp">
 +
int main(int argc, char **argv)
 +
{
 +
if (argc != 2)
 +
{
 +
cout << "Incorrect number of arg";
 +
return 0;
 +
}
 +
 +
char* path = argv[1];
 +
BinaryBitmap* bitmap = LoadBitmap(path);
 +
if (bitmap == NULL)
 +
return 0;
 +
 +
 +
int d;
 +
cudaDeviceProp prop;
 +
cudaGetDevice(&d);
 +
cudaGetDeviceProperties(&prop, d);
 +
unsigned ntpb = prop.maxThreadsDim[0];
 +
 +
int size = bitmap->GetSize();
 +
int width = bitmap->_width;
 +
int height = bitmap->_height;
 +
 +
// Allocate GPU memory
 +
float* d_src;
 +
float * d_dst;
 +
cudaMalloc((void**)&d_src, size * sizeof(float));
 +
cudaMalloc((void**)&d_dst, size * sizeof(float));
 +
 +
// Copy src to device src
 +
cudaMemcpy(d_src, bitmap->_pixels, size * sizeof(float), cudaMemcpyHostToDevice);
 +
 +
// Prepare kernal launch
 +
// Calc how many blocks to launch
 +
int numOfBlocks =  ((size + ntpb - 1) / ntpb); //(size / ntpb) + 1;
 +
// Launch grid
 +
SDFGenerateCuda << <numOfBlocks, ntpb >> >(d_src, d_dst, width, height, 64);
 +
 +
// Wait for kernel to finish before copying
 +
cudaDeviceSynchronize();
 +
 +
// buffer array for SDF pixels
 +
float* dst = new float[bitmap->GetSize()];
 +
cudaMemcpy(dst, d_dst, size * sizeof(float), cudaMemcpyDeviceToHost);
 +
 +
 +
SaveBitmap(path, dst);
 +
 +
Pause();
 +
 +
// Free memory back
 +
cudaFree(d_src);
 +
cudaFree(d_dst);
 +
delete bitmap;
 +
delete dst;
 +
 +
return 1;
 +
 +
}
 +
</syntaxhighlight>
 +
 +
=== The Kernel ===
 +
<syntaxhighlight lang="cpp">
 +
__global__ void SDFGenerateCuda(const float src[], float dst[], int width, int height, int spread)
 +
{
 +
int size = width * height;
 +
int i = blockIdx.x * blockDim.x + threadIdx.x;
 +
if (i >= size)
 +
return;
 +
 +
 +
//Vector2 localVec(i - ((i / width) * width), i / width);
 +
float lx = i - ((i / width) * width);
 +
float ly = i / width;
 +
 +
float shortestDist = MAX_FLOAT_VALUE;
 +
float pixelVal = src[i];
 +
if (pixelVal > 0) // It's an inside pixel
 +
{
 +
// Find closest outside pixel
 +
for (int j = 0; j < size; j++)
 +
{
 +
float pixelVal2 = src[j];
 +
if (pixelVal2 == 0)// Outside pixel
 +
{
 +
// Calculate distance
 +
//Vector2 targetVec(j - ((j / width) * width), j / width);
 +
//float dist = localVec.CalcDistance(targetVec);
 +
 +
float tx = j - ((j / width) * width);
 +
float ty = j / width;
 +
 +
float dx = tx - lx;
 +
float dy = ty - ly;
 +
float dist = sqrtf(dx * dx + dy * dy);
 +
if (dist < shortestDist) shortestDist = dist;
 +
}
 +
 +
}
 +
 +
float spread01 = (shortestDist / spread);
 +
if (spread01 > 1) spread01 = 1; // clamp it
 +
dst[i] = (spread01 * .5f) + 0.5f;
 +
}
 +
else // It's an outisde pixel
 +
{
 +
// Find closest inside pixel
 +
for (int j = 0; j < size; j++)
 +
{
 +
float pixelVal2 = src[j];
 +
if (pixelVal2 > 0)// Inside pixel
 +
{
 +
// Calculate distance
 +
//Vector2 targetVec(j - ((j / width) * width), j / width);
 +
//float dist = localVec.CalcDistance(targetVec);
 +
 +
float tx = j - ((j / width) * width);
 +
float ty = j / width;
 +
 +
float dx = tx - lx;
 +
float dy = ty - ly;
 +
float dist = sqrtf(dx * dx + dy * dy);
 +
if (dist < shortestDist) shortestDist = dist;
 +
}
 +
}
 +
 +
float spread01 = (shortestDist / spread);
 +
if (spread01 > 1) spread01 = 1; // clamp it
 +
dst[i] = (1 - spread01) *.5f;
 +
}
 +
 +
}
 +
 +
</syntaxhighlight>
 +
 +
 +
=== GPU Bench Marks ===
 +
 +
=== Converting 128x128 Image on GPU ===
 +
 +
Converting took less than one second. Serial CPU took 6.5 seconds.
 +
 +
=== Converting 256x256 Image on GPU ===
 +
 +
Converting took 1 seconds. Serial CPU took 110 seconds.
 +
 +
=== Converting 512x512 Image on GPU ===
 +
 +
Converting took less than 8 seconds. Serial CPU took about 30 minutes. '''However the GPU driver gives a message that GPU has stopped responding. I think the GPU stops the program.'''
 +
 +
 +
=== Converting 1024x1024 Image on GPU ===
 +
 +
Converting took less than 8 seconds. Will take probably hours in CPU serial. '''However the GPU driver gives a message that GPU has stopped responding. I think the GPU stops the program.'''
 +
 
=== Assignment 3 ===
 
=== Assignment 3 ===
 +
 +
=== GPU Optimization Phase 1 ===
 +
 +
First step of optimizing is by finding a way to use less sqrtf function. It turns out that we are using lots of sqrtf for comparing distances to see if one is larger than the other. However, when squaring two numbers, it does not change which is the larger number. We can take advantage of this by comparing the squared distance, not the actual distance. This way we only need to use sqrtf once per pixel! Below is the optimized kernel:
 +
 +
<syntaxhighlight lang="cpp">
 +
 +
__global__ void SDFGenerateCuda(const float src[], float dst[], int width, int height, int spread)
 +
{
 +
int size = width * height;
 +
int i = blockIdx.x * blockDim.x + threadIdx.x;
 +
if (i >= size)
 +
return;
 +
 +
float lx = i - ((i / width) * width);
 +
float ly = i / width;
 +
 +
 +
// Used for avoiding unnecessary sqrt calc.
 +
// Just compare the two sqaured distances and
 +
// only use sqrt if it is the shorest distance
 +
float shortestDistSqured = MAX_FLOAT_VALUE;
 +
 +
float pixelVal = src[i];
 +
if (pixelVal > 0) // It's an inside pixel
 +
{
 +
// Find closest outside pixel
 +
for (int j = 0; j < size; j++)
 +
{
 +
float pixelVal2 = src[j];
 +
if (pixelVal2 == 0)// Outside pixel
 +
{
 +
// Calculate distance
 +
float tx = j - ((j / width) * width);
 +
float ty = j / width;
 +
 +
float dx = tx - lx;
 +
float dy = ty - ly;
 +
float distSqured = dx * dx + dy * dy;
 +
if (distSqured < shortestDistSqured) shortestDistSqured = distSqured;
 +
 +
}
 +
 +
}
 +
 +
float shortestDist = sqrtf(shortestDistSqured);
 +
float spread01 = (shortestDist / spread);
 +
if (spread01 > 1) spread01 = 1; // clamp it
 +
dst[i] = (spread01 * .5f) + 0.5f;
 +
}
 +
else // It's an outisde pixel
 +
{
 +
// Find closest inside pixel
 +
for (int j = 0; j < size; j++)
 +
{
 +
float pixelVal2 = src[j];
 +
if (pixelVal2 > 0)// Inside pixel
 +
{
 +
// Calculate distance
 +
float tx = j - ((j / width) * width);
 +
float ty = j / width;
 +
 +
float dx = tx - lx;
 +
float dy = ty - ly;
 +
float distSqured = dx * dx + dy * dy;
 +
if (distSqured < shortestDistSqured) shortestDistSqured = distSqured;
 +
}
 +
}
 +
 +
float shortestDist = sqrtf(shortestDistSqured);
 +
float spread01 = (shortestDist / spread);
 +
if (spread01 > 1) spread01 = 1; // clamp it
 +
dst[i] = (1 - spread01) *.5f;
 +
}
 +
 +
}
 +
 +
 +
</syntaxhighlight>
 +
 +
=== GPU Optimization Phase 2 ===
 +
For every n, we are calculating the XYCoords n number of times which is a total of n^2 times. Since the XYCoord of pixels are fixed for every pixel, we can pre generate XYCoord arrays to be mapped by a single array index. However, this will increase the GPU's access to global memory. We will need to benchmark and see if this will give better times or not.
 +
 +
=== GenerateXYCoord Kernel ===
 +
 +
<syntaxhighlight lang="cpp">
 +
__global__ void GenerateXYCoord(int xCoord[], int yCoord[], int width, int height)
 +
{
 +
int size = width * height;
 +
int i = blockIdx.x * blockDim.x + threadIdx.x;
 +
if (i >= size)
 +
return;
 +
 +
int y = (i / width);
 +
xCoord[i] = i - (y * width);
 +
yCoord[i] = y;
 +
}
 +
 +
</syntaxhighlight >
 +
 +
=== SDFGenerateCuda Kernel Optimized Phase 2 ===
 +
 +
<syntaxhighlight lang="cpp">
 +
 +
__global__ void SDFGenerateCuda(const float src[], float dst[], const int xCoord[], const int yCoord[], int size, int spread)
 +
{
 +
 +
int i = blockIdx.x * blockDim.x + threadIdx.x;
 +
if (i >= size)
 +
return;
 +
 +
 +
// Used for avoiding unnecessary sqrt calc.
 +
// Just compare the two sqaured distances and
 +
// only use sqrt if it is the shorest distance
 +
float shortestDistSquared = MAX_FLOAT_VALUE;
 +
 +
float pixelVal = src[i];
 +
if (pixelVal > 0) // It's an inside pixel
 +
{
 +
// Find closest outside pixel
 +
for (int j = 0; j < size; j++)
 +
{
 +
float pixelVal2 = src[j];
 +
if (pixelVal2 == 0)// Outside pixel
 +
{
 +
// Calculate distance
 +
float dx = xCoord[j] - xCoord[i];
 +
float dy = yCoord[j] - yCoord[i];
 +
float distSqured = dx * dx + dy * dy;
 +
if (distSqured < shortestDistSquared) shortestDistSquared = distSqured;
 +
 +
}
 +
 +
}
 +
 +
float shortestDist = sqrtf(shortestDistSquared);
 +
float spread01 = (shortestDist / spread);
 +
if (spread01 > 1) spread01 = 1; // clamp it
 +
dst[i] = (spread01 * .5f) + 0.5f;
 +
}
 +
else // It's an outisde pixel
 +
{
 +
// Find closest inside pixel
 +
for (int j = 0; j < size; j++)
 +
{
 +
float pixelVal2 = src[j];
 +
if (pixelVal2 > 0)// Inside pixel
 +
{
 +
// Calculate distance
 +
float dx = xCoord[j] - xCoord[i];
 +
float dy = yCoord[j] - yCoord[i];
 +
float distSqured = dx * dx + dy * dy;
 +
if (distSqured < shortestDistSquared) shortestDistSquared = distSqured;
 +
}
 +
}
 +
 +
float shortestDist = sqrtf(shortestDistSquared);
 +
float spread01 = (shortestDist / spread);
 +
if (spread01 > 1) spread01 = 1; // clamp it
 +
dst[i] = (1 - spread01) *.5f;
 +
}
 +
 +
}
 +
 +
</syntaxhighlight >
 +
 +
 +
=== Launch Config GPU Optimized Phase 2 ===
 +
 +
<syntaxhighlight lang="cpp">
 +
int main(int argc, char **argv)
 +
{
 +
if (argc != 2)
 +
{
 +
cout << "Incorrect number of arg";
 +
return 1;
 +
}
 +
 +
char* path = argv[1];
 +
BinaryBitmap* bitmap = LoadBitmap(path);
 +
if (bitmap == NULL)
 +
return 1;
 +
 +
 +
int d;
 +
cudaDeviceProp prop;
 +
cudaGetDevice(&d);
 +
cudaGetDeviceProperties(&prop, d);
 +
unsigned ntpb = prop.maxThreadsDim[0];
 +
 +
int size = bitmap->GetSize();
 +
int width = bitmap->_width;
 +
int height = bitmap->_height;
 +
 +
// Allocate GPU memory
 +
float* d_src;
 +
float * d_dst;
 +
int* d_xCoord;
 +
int* d_yCoord;
 +
cudaMalloc((void**)&d_src, size * sizeof(float));
 +
cudaMalloc((void**)&d_dst, size * sizeof(float));
 +
cudaMalloc((void**)&d_xCoord, size * sizeof(int));
 +
cudaMalloc((void**)&d_yCoord, size * sizeof(int));
 +
 +
 +
// Copy src to device src
 +
cudaMemcpy(d_src, bitmap->_pixels, size * sizeof(float), cudaMemcpyHostToDevice);
 +
 +
// Prepare kernal launch
 +
// Calc how many blocks to launch
 +
int numOfBlocks = ((size + ntpb - 1) / ntpb);
 +
 +
 +
// Launch grid for pre-calculating XYCoords
 +
GenerateXYCoord << <numOfBlocks, ntpb >> >(d_xCoord, d_yCoord, width, height);
 +
 +
// Launch grid for converting
 +
SDFGenerateCuda << <numOfBlocks, ntpb >> >(d_src, d_dst, d_xCoord, d_yCoord, size, 64);
 +
 +
// Wait for kernel to finish before copying
 +
cudaDeviceSynchronize();
 +
 +
// buffer array for SDF pixels
 +
float* dst = new float[bitmap->GetSize()];
 +
cudaMemcpy(dst, d_dst, size * sizeof(float), cudaMemcpyDeviceToHost);
 +
 +
 +
SaveBitmap(path, dst);
 +
 +
Pause();
 +
 +
// Free memory back
 +
cudaFree(d_src);
 +
cudaFree(d_dst);
 +
cudaFree(d_xCoord);
 +
cudaFree(d_yCoord);
 +
delete bitmap;
 +
delete dst;
 +
 +
return 0;
 +
 +
}
 +
</syntaxhighlight >

Latest revision as of 23:25, 11 April 2017

Signed Distance Field Generator

Team Members

  1. Dawood Shirzada - Developer

Email All

Progress

Assignment 1

What is Signed Distance Field?

Signed Distance Field also know as SDF, is a technique developed by Valve company that uses low resolution textures to display extremely high resolution looking fonts and decals. Valve used SDF in their game engines that run such games as Half-Life 2, Counter-Strike 2 and etc. SDF is so effective that no matter how many times the font or decal is zoomed in, it will always look crisp and sharp while using very small textures. This allows fonts and decals in game to have much higher quality with low memory compare to using regular high resolution textures.

For more detailed information please read Valve's publication: http://www.valvesoftware.com/publications/2007/SIGGRAPH2007_AlphaTestedMagnification.pdf

Examples from Valve

SDF exmaple from Valve.jpeg


Examples in action

https://youtu.be/CGZRHJvJYIg?t=40


How does Signed Distance Field work?

SDF ONLY works with monochromatic images such as decals and fonts. SDF takes the original texture as an input and creates a SDF version of that texture and saves it in an image format. The game engine then uses the low resolution SDF texture instead.


SDFvsOJ.png


SDF version is actually much different than the original image. the SDF version no longer stores the pixel color intensity like normal images do, but instead stores the distance to nearest opposite color. For example, since monochromatic images only have black and white, for every white pixel, you look for the nearest black pixel and stores the distance between the two and vice versa.

To read the SDF version of image, we will need to use a custom shader that can understand the SDF version. With help of custom shaders, we can do many more effects such as edge glows, drop shadows, soft edges and etc. All these effects at virtually no additional rendering costs. SDF is a huge win when it comes to gaming performance!

How to convert image to SDF version?

It turns out to convert a image to SDF, it is very computationally expensive. There are however, many methods that approximates and are relatively fast, but the Brute-Force method produces the most accurate result and it is the method that Valve used for their textures. Therefore, I will be using this method as well.

Big-O Complexity

For every pixel in a image, we will need to test it against every other pixel. This makes its complexity O(n^2). For example a 256x256 has 65,536 pixel. Each pixel would have to be tested against 65536 pixels to find out the nearest corresponding pixel. So it needs 65,536 * 65,536 = 4,294,967,296 array element look ups!

Why I chose this

I chose SDF image conversion because it has lots of potential for parallelization. Since all the operations of a pixel are independent of each other and reads data from one single array, this allows for massive gains when using GPU multi-threading.


SDF Brute-Force Method

void Generate(const float src[], float dst[], int width, int height, int spread)
{
	
	int spreadSize = spread * spread;
	int size = width * height;
	for (int i = 0; i < size; i++)
	{
		Vector2 localVec(i - ((i / width) * width), i / width);
		float shortestDist = MAX_FLOAT_VALUE;
		float pixelVal = src[i];
		if (pixelVal > 0) // It's an inside pixel
		{
			// Find closest outside pixel
			for (int j = 0; j < size; j++)
			{
				float pixelVal2 = src[j];
				if (pixelVal2 == 0)// Outside pixel
				{
					// Calculate distance
					Vector2 targetVec(j - ((j / width) * width), j / width);
					float dist = localVec.CalcDistance(targetVec);
					if (dist < shortestDist) shortestDist = dist;
				}

			}

			float spread01 = (shortestDist / spread);
			if (spread01 > 1) spread01 = 1; // clamp it
			dst[i] = (spread01 * .5f) + 0.5f;
		}
		else // It's an outisde pixel
		{
			// Find closest inside pixel
			for (int j = 0; j < size; j++)
			{
				float pixelVal2 = src[j];
				if (pixelVal2 > 0)// Inside pixel
				{
					// Calculate distance
					Vector2 targetVec(j - ((j / width) * width), j / width);
					float dist = localVec.CalcDistance(targetVec);
					if (dist < shortestDist) shortestDist = dist;
				}
			}

			float spread01 = (shortestDist / spread);
			if (spread01 > 1) spread01 = 1; // clamp it
			dst[i] = (1 - spread01) *.5f;
		}
	}
}


Bench Marks

Converting a 128x128 image

A1 128.png



Converting a 256x256 image

A1 256.png



Converting a 512x512 image

A1 512.png


Analysis

As you can see, it gets much more expensive as the image size increases. This is due to the n^2 algorithm. In order to to be able to convert images to SDF for game engine in reasonable amount of time, we need to look into GPU parallelism.

Assignment 2

Converting the code to Cuda

For now, this code will be a direct conversion to Cuda code without any special consideration for GPU related optimizations.

Launch Config

int main(int argc, char **argv)
{
	if (argc != 2)
	{
		cout << "Incorrect number of arg";
		return 0;
	}

	char* path = argv[1];
	BinaryBitmap* bitmap = LoadBitmap(path);
	if (bitmap == NULL)
		return 0;


	int d;
	cudaDeviceProp prop;
	cudaGetDevice(&d);
	cudaGetDeviceProperties(&prop, d);
	unsigned ntpb = prop.maxThreadsDim[0];

	int size = bitmap->GetSize();
	int width = bitmap->_width;
	int height = bitmap->_height;

	// Allocate GPU memory
	float* d_src;
	float * d_dst;
	cudaMalloc((void**)&d_src, size * sizeof(float));
	cudaMalloc((void**)&d_dst, size * sizeof(float));

	// Copy src to device src
	cudaMemcpy(d_src, bitmap->_pixels, size * sizeof(float), cudaMemcpyHostToDevice);

	// Prepare kernal launch
	// Calc how many blocks to launch
	int numOfBlocks =  ((size + ntpb - 1) / ntpb); //(size / ntpb) + 1;
	// Launch grid
	SDFGenerateCuda << <numOfBlocks, ntpb >> >(d_src, d_dst, width, height, 64);

	// Wait for kernel to finish before copying
	cudaDeviceSynchronize();

	// buffer array for SDF pixels
	float* dst = new float[bitmap->GetSize()];
	cudaMemcpy(dst, d_dst, size * sizeof(float), cudaMemcpyDeviceToHost);


	SaveBitmap(path, dst);

	Pause();

	// Free memory back
	cudaFree(d_src);
	cudaFree(d_dst);
	delete bitmap;
	delete dst;

	return 1;

}

The Kernel

 __global__ void SDFGenerateCuda(const float src[], float dst[], int width, int height, int spread)
{
	int size = width * height;
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= size)
		return;


	//Vector2 localVec(i - ((i / width) * width), i / width);
	float lx = i - ((i / width) * width);
	float ly = i / width;

	float shortestDist = MAX_FLOAT_VALUE;
	float pixelVal = src[i];
	if (pixelVal > 0) // It's an inside pixel
	{
		// Find closest outside pixel
		for (int j = 0; j < size; j++)
		{
			float pixelVal2 = src[j];
			if (pixelVal2 == 0)// Outside pixel
			{
				// Calculate distance
				//Vector2 targetVec(j - ((j / width) * width), j / width);
				//float dist = localVec.CalcDistance(targetVec);

				float tx = j - ((j / width) * width);
				float ty = j / width;

				float dx = tx - lx;
				float dy = ty - ly;
				float dist = sqrtf(dx * dx + dy * dy);
				if (dist < shortestDist) shortestDist = dist;
			}

		}

		float spread01 = (shortestDist / spread);
		if (spread01 > 1) spread01 = 1; // clamp it
		dst[i] = (spread01 * .5f) + 0.5f;
	}
	else // It's an outisde pixel
	{
		// Find closest inside pixel
		for (int j = 0; j < size; j++)
		{
			float pixelVal2 = src[j];
			if (pixelVal2 > 0)// Inside pixel
			{
				// Calculate distance
				//Vector2 targetVec(j - ((j / width) * width), j / width);
				//float dist = localVec.CalcDistance(targetVec);

				float tx = j - ((j / width) * width);
				float ty = j / width;

				float dx = tx - lx;
				float dy = ty - ly;
				float dist = sqrtf(dx * dx + dy * dy);
				if (dist < shortestDist) shortestDist = dist;
			}
		}

		float spread01 = (shortestDist / spread);
		if (spread01 > 1) spread01 = 1; // clamp it
		dst[i] = (1 - spread01) *.5f;
	}

}


GPU Bench Marks

Converting 128x128 Image on GPU

Converting took less than one second. Serial CPU took 6.5 seconds.

Converting 256x256 Image on GPU

Converting took 1 seconds. Serial CPU took 110 seconds.

Converting 512x512 Image on GPU

Converting took less than 8 seconds. Serial CPU took about 30 minutes. However the GPU driver gives a message that GPU has stopped responding. I think the GPU stops the program.


Converting 1024x1024 Image on GPU

Converting took less than 8 seconds. Will take probably hours in CPU serial. However the GPU driver gives a message that GPU has stopped responding. I think the GPU stops the program.

Assignment 3

GPU Optimization Phase 1

First step of optimizing is by finding a way to use less sqrtf function. It turns out that we are using lots of sqrtf for comparing distances to see if one is larger than the other. However, when squaring two numbers, it does not change which is the larger number. We can take advantage of this by comparing the squared distance, not the actual distance. This way we only need to use sqrtf once per pixel! Below is the optimized kernel:

__global__ void SDFGenerateCuda(const float src[], float dst[], int width, int height, int spread)
{
	int size = width * height;
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= size)
		return;
	
	float lx = i - ((i / width) * width);
	float ly = i / width;


	// Used for avoiding unnecessary sqrt calc. 
	// Just compare the two sqaured distances and 
	// only use sqrt if it is the shorest distance
	float shortestDistSqured = MAX_FLOAT_VALUE;

	float pixelVal = src[i];
	if (pixelVal > 0) // It's an inside pixel
	{
		// Find closest outside pixel
		for (int j = 0; j < size; j++)
		{
			float pixelVal2 = src[j];
			if (pixelVal2 == 0)// Outside pixel
			{
				// Calculate distance
				float tx = j - ((j / width) * width);
				float ty = j / width;

				float dx = tx - lx;
				float dy = ty - ly;
				float distSqured = dx * dx + dy * dy;
				if (distSqured < shortestDistSqured) shortestDistSqured = distSqured;
				
			}

		}

		float shortestDist = sqrtf(shortestDistSqured);
		float spread01 = (shortestDist / spread);
		if (spread01 > 1) spread01 = 1; // clamp it
		dst[i] = (spread01 * .5f) + 0.5f;
	}
	else // It's an outisde pixel
	{
		// Find closest inside pixel
		for (int j = 0; j < size; j++)
		{
			float pixelVal2 = src[j];
			if (pixelVal2 > 0)// Inside pixel
			{
				// Calculate distance
				float tx = j - ((j / width) * width);
				float ty = j / width;

				float dx = tx - lx;
				float dy = ty - ly;
				float distSqured = dx * dx + dy * dy;
				if (distSqured < shortestDistSqured) shortestDistSqured = distSqured;
			}
		}

		float shortestDist = sqrtf(shortestDistSqured);
		float spread01 = (shortestDist / spread);
		if (spread01 > 1) spread01 = 1; // clamp it
		dst[i] = (1 - spread01) *.5f;
	}

}

GPU Optimization Phase 2

For every n, we are calculating the XYCoords n number of times which is a total of n^2 times. Since the XYCoord of pixels are fixed for every pixel, we can pre generate XYCoord arrays to be mapped by a single array index. However, this will increase the GPU's access to global memory. We will need to benchmark and see if this will give better times or not.

GenerateXYCoord Kernel

__global__ void GenerateXYCoord(int xCoord[], int yCoord[], int width, int height)
{
	int size = width * height;
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= size)
		return;

	int y = (i / width);
	xCoord[i] = i - (y * width);
	yCoord[i] = y;
}

SDFGenerateCuda Kernel Optimized Phase 2

__global__ void SDFGenerateCuda(const float src[], float dst[], const int xCoord[], const int yCoord[], int size, int spread)
{
	
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= size)
		return;
	

	// Used for avoiding unnecessary sqrt calc. 
	// Just compare the two sqaured distances and 
	// only use sqrt if it is the shorest distance
	float shortestDistSquared = MAX_FLOAT_VALUE;

	float pixelVal = src[i];
	if (pixelVal > 0) // It's an inside pixel
	{
		// Find closest outside pixel
		for (int j = 0; j < size; j++)
		{
			float pixelVal2 = src[j];
			if (pixelVal2 == 0)// Outside pixel
			{
				// Calculate distance
				float dx = xCoord[j] - xCoord[i];
				float dy = yCoord[j] - yCoord[i];
				float distSqured = dx * dx + dy * dy;
				if (distSqured < shortestDistSquared) shortestDistSquared = distSqured;
				
			}

		}

		float shortestDist = sqrtf(shortestDistSquared);
		float spread01 = (shortestDist / spread);
		if (spread01 > 1) spread01 = 1; // clamp it
		dst[i] = (spread01 * .5f) + 0.5f;
	}
	else // It's an outisde pixel
	{
		// Find closest inside pixel
		for (int j = 0; j < size; j++)
		{
			float pixelVal2 = src[j];
			if (pixelVal2 > 0)// Inside pixel
			{
				// Calculate distance
				float dx = xCoord[j] - xCoord[i];
				float dy = yCoord[j] - yCoord[i];
				float distSqured = dx * dx + dy * dy;
				if (distSqured < shortestDistSquared) shortestDistSquared = distSqured;
			}
		}

		float shortestDist = sqrtf(shortestDistSquared);
		float spread01 = (shortestDist / spread);
		if (spread01 > 1) spread01 = 1; // clamp it
		dst[i] = (1 - spread01) *.5f;
	}

}


Launch Config GPU Optimized Phase 2

int main(int argc, char **argv)
{
	if (argc != 2)
	{
		cout << "Incorrect number of arg";
		return 1;
	}

	char* path = argv[1];
	BinaryBitmap* bitmap = LoadBitmap(path);
	if (bitmap == NULL)
		return 1;


	int d;
	cudaDeviceProp prop;
	cudaGetDevice(&d);
	cudaGetDeviceProperties(&prop, d);
	unsigned ntpb = prop.maxThreadsDim[0];

	int size = bitmap->GetSize();
	int width = bitmap->_width;
	int height = bitmap->_height;

	// Allocate GPU memory
	float* d_src;
	float * d_dst;
	int* d_xCoord;
	int* d_yCoord;
	cudaMalloc((void**)&d_src, size * sizeof(float));
	cudaMalloc((void**)&d_dst, size * sizeof(float));
	cudaMalloc((void**)&d_xCoord, size * sizeof(int));
	cudaMalloc((void**)&d_yCoord, size * sizeof(int));


	// Copy src to device src
	cudaMemcpy(d_src, bitmap->_pixels, size * sizeof(float), cudaMemcpyHostToDevice);

	// Prepare kernal launch
	// Calc how many blocks to launch
	int numOfBlocks = ((size + ntpb - 1) / ntpb);


	// Launch grid for pre-calculating XYCoords
	GenerateXYCoord << <numOfBlocks, ntpb >> >(d_xCoord, d_yCoord, width, height);

	// Launch grid for converting
	SDFGenerateCuda << <numOfBlocks, ntpb >> >(d_src, d_dst, d_xCoord, d_yCoord, size, 64);
	
	// Wait for kernel to finish before copying
	cudaDeviceSynchronize();

	// buffer array for SDF pixels
	float* dst = new float[bitmap->GetSize()];
	cudaMemcpy(dst, d_dst, size * sizeof(float), cudaMemcpyDeviceToHost);


	SaveBitmap(path, dst);

	Pause();

	// Free memory back
	cudaFree(d_src);
	cudaFree(d_dst);
	cudaFree(d_xCoord);
	cudaFree(d_yCoord);
	delete bitmap;
	delete dst;

	return 0;

}