Open main menu

CDOT Wiki β

Changes

The B-Team

7,500 bytes added, 18:17, 30 November 2012
no edit summary
# [mailto:kmbarnhart@learn.senecac.on.ca?sujbect=DPS915 Kyle Barnhart]
 
 
 
= Assignment 3 =
== Mandelbrot Set ==
=== Repo ===
https://github.com/KyleBarnhart/Fractal/tree/cuda
 
=== Description ===
 
My Mandelbrot Set program is not the type of program that the reduction of an array to a value, nor does the computation of values require knowledge of other values in an array. In fact the only time global memory is used in the Mandelbrot function is when the value is set. The input is derived from a number of constants and the position of the thread in the grid and block. There are a number of constraints but the most prominent is the size of global device memory and the size of an unsigned integer. To make very large images, very large arrays of values are needed. I enhanced a calculation to determine the largest possible size. The number of values is divided by that size and the image is built over a number of passes.
 
 
 
In actual fact the main Mandelbrot calculation was already as efficient as possible. It is a simple function and there simply were no changes to make. The initial CUDA version of the program saw and average improvement of 737.6% over the original CPU version of the program. The optimized version was 802.9% faster than the CPU version, which is 10.4% better than the original CUDA version.
 
 
 
=== Code ===
 
I'm only going to post the kernels that changed. There were also many changes in the host code that helped make the program faster.
<big><pre>
 
// part of common.h
const int BLOCK_SIZE_X = 16;
 
const int BLOCK_SIZE_Y = 8;
 
const int BLOCK_SIZE_SSAA = 256;
 
const int BLOCK_SIZE_RGB = 16;
 
 
 
 
const int MAX_GRID_SIZE_X = 65536;
 
 
 
 
const uint8_t MAX_ALIASING_FACTOR = 16;
</pre></big>
<big><pre>
 
// part of main.cpp
 
cudaError_t error;
 
int iDevice;
 
cudaDeviceProp prop;
 
 
 
// Get device information for total global memory
 
error = cudaGetDevice(&iDevice);
 
if(error != cudaSuccess)
 
displayCudeError(error);
 
 
 
error = cudaGetDeviceProperties(&prop, iDevice);
 
if(error != cudaSuccess)
 
displayCudeError(error);
 
 
 
// The max amount to do per pass demends on the size of GPU memory and the size of unsigned integer.
 
// Global memory is devided by two so that both the value array and RGB array can both fit in memory.
 
DimensionSqType maxPixelsPerPass = (UINT_MAX > (prop.totalGlobalMem / 2)) ? (prop.totalGlobalMem / 2) : UINT_MAX;
 
 
 
// RGB + alpha is 4 BYTEs. Make sure two copies of the larger can fit in device memory.
 
DimensionType largerType = ((4 * sizeof(BYTE)) < sizeof(ElementType)) ? sizeof(ElementType) : (4 * sizeof(BYTE));
 
 
 
// Divide by two for extra safty.
 
maxPixelsPerPass /= (largerType * 2);
 
</pre></big>
 
 
 
<big><pre>
__device__ ElementType mandelbrot(ElementType c_i, ElementType c_r, IterationType iterations)
 
{
 
ElementType z_r = c_r;
 
ElementType z_i = c_i;
 
 
ElementType z2_r = z_r * z_r;
 
ElementType z2_i = z_i * z_i;
 
 
IterationType n = 0;
 
 
while(n < iterations && z2_r + z2_i < 4.0)
 
{
 
z_i = 2.0 * z_r * z_i + c_i;
 
z_r = z2_r - z2_i + c_r;
 
 
z2_r = z_r * z_r;
 
z2_i = z_i * z_i;
 
 
n++;
 
}
 
 
 
z_i = 2.0 * z_r * z_i + c_i;
 
z_r = z2_r - z2_i + c_r;
 
 
z2_r = z_r * z_r;
 
z2_i = z_i * z_i;
 
 
 
z_i = 2.0 * z_r * z_i + c_i;
 
z_r = z2_r - z2_i + c_r;
 
 
z2_r = z_r * z_r;
 
z2_i = z_i * z_i;
 
 
n += 2;
 
 
 
if(n > iterations)
 
{
 
return (ElementType)iterations;
 
}
 
else
 
{
 
return (ElementType)n + 1.0 - __logf(__logf(__dsqrt_rn(z2_r + z2_i)))/__logf(2.0);;
 
}
 
}
</pre></big>
<big><pre>
// Calculate if c is in Mandelbrot set.
// Return number of iterations.
__global__ void getFractal(ElementType* img, ElementType yMax, ElementType xMin, ElementType xScale, ElementType yScale, IterationType iterations, DimensionType width, DimensionType height)
 
{
 
DimensionType dx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
 
DimensionType dy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y;
 
 
 
if(dx >= width || dy >= height)
 
return;
 
 
// This is fine because so few registers are used
 
img[(DimensionSqType)dy * (DimensionSqType)width + (DimensionSqType)dx] = mandelbrot(yMax - (ElementType)dy * yScale,
 
xMin + (ElementType)dx * xScale,
 
iterations);
 
}
</pre></big>
<big><pre>
// Calculate if c is in Mandelbrot set.
// Return number of iterations.
__global__ void getFractalSSAA(ElementType* img, DimensionSqType* list, DimensionSqType length, ElementType yMax, ElementType xMin,
 
ElementType xScale, ElementType yScale, IterationType iterations,
 
DimensionType width, AlisingFactorType ssaafactor)
 
{
 
DimensionType curr = blockIdx.x * BLOCK_SIZE_SSAA + threadIdx.x;
 
 
 
if(curr >= length)
 
return;
 
 
 
DimensionSqType val = list[curr];
 
 
ElementType xSubScale = xScale / ((ElementType)ssaafactor);
 
ElementType ySubScale = yScale / ((ElementType)ssaafactor);
 
 
 
// Get the centre of the top left subpixel
 
xMin = xMin + (ElementType)(val % width) * xScale - (xScale / 2.0) + (xSubScale / 2.0);
 
yMax = yMax - (ElementType)(val / width) * yScale + (yScale / 2.0) - (ySubScale / 2.0);
 
 
// Get the values for each pixel in fractal
 
ElementType subpixels[MAX_ALIASING_FACTOR * MAX_ALIASING_FACTOR];
 
 
for(AlisingFactorType x = 0; x < ssaafactor; x++)
 
{
 
for(AlisingFactorType y = 0; y < ssaafactor; y++)
 
{
 
subpixels[x * ssaafactor + y] = mandelbrot(yMax - ySubScale * y , xMin + xSubScale * x, iterations);
 
}
 
}
 
 
 
AlisingFactorSqType factor2 = (AlisingFactorSqType)ssaafactor * (AlisingFactorSqType)ssaafactor;
 
 
 
if(factor2 % 2 != 0)
 
{
 
img[val] = getMedian(subpixels, (AlisingFactorSqType)ssaafactor * (AlisingFactorSqType)ssaafactor / 2, factor2);
 
}
 
else
 
{
 
img[val] = (getMedian(subpixels, factor2 / 2 - 1, factor2)
 
+ getMedian(subpixels, factor2 / 2, factor2))
 
/ 2.0;
 
}
 
}
</pre></big>
<big><pre>
__global__ void getBmpRGB(BYTE* image, ElementType* values, DimensionType width, DimensionType height, IterationType iterations)
{
DimensionType dy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
DimensionType dx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
if(dx >= width || dy >= height)
return;
 
DimensionType c = dy * width + dx;
BYTE rgbValue[3];
 
getRGB(values[c]/(ElementType)iterations, rgbValue);
image[c*3] = rgbValue[2];
image[c*3 + 1] = rgbValue[1];
image[c*3 + 2] = rgbValue[0];
}
</pre></big>
<big><pre>
__global__ void getBmpRGBfromHistorgram(ElementType* map, BYTE* image, ElementType* values, DimensionType width, DimensionType height)
{
DimensionType dy = blockIdx.y * BLOCK_SIZE + threadIdx.y;
DimensionType dx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
if(dx >= width || dy >= height)
return;
 
DimensionType c = dy * width + dx;
 
IterationType ival = (IterationType)values[c];
 
ElementType colourVal = map[ival] + (values[c] - (ElementType)ival) * (map[ival + 1] - map[ival]);
BYTE rgbValue[3];
 
getRGB(colourVal, rgbValue);
image[c*3] = rgbValue[2];
image[c*3 + 1] = rgbValue[1];
image[c*3 + 2] = rgbValue[0];
}
</pre></big>
1
edit