1
edit
Changes
→Code
I'm only going to post the kernels that changed. There were also many changes in the host code that helped make the program faster.
<big><pre>
// part of common.h
// Launch-geometry constants shared by the host code and the kernels.
// Block width used by getFractal to turn blockIdx.x/threadIdx.x into a pixel column.
const int BLOCK_SIZE_X = 16;
// Block height used by getFractal to turn blockIdx.y/threadIdx.y into a pixel row.
const int BLOCK_SIZE_Y = 8;
// 1D block size used by getFractalSSAA to compute its index into the pixel list.
const int BLOCK_SIZE_SSAA = 256;
// Presumably the block size of the RGB conversion kernel (not shown here) -- TODO confirm.
const int BLOCK_SIZE_RGB = 16;
// Presumably the cap on gridDim.x used by the host when splitting work -- TODO confirm.
// NOTE(review): devices below compute capability 3.0 allow at most 65535 blocks in x;
// verify that a grid of exactly 65536 blocks is never launched on such hardware.
const int MAX_GRID_SIZE_X = 65536;
// Largest supported supersampling factor; getFractalSSAA sizes its per-thread
// sub-pixel buffer as MAX_ALIASING_FACTOR * MAX_ALIASING_FACTOR elements.
const uint8_t MAX_ALIASING_FACTOR = 16;
</pre></big>
<big><pre>
// part of main.cpp
// Query the current device and derive from its global memory size how many
// pixels may be processed in a single pass.
cudaError_t error;
int iDevice;
cudaDeviceProp prop;
// Get device information for total global memory
error = cudaGetDevice(&iDevice);
// NOTE(review): this check has no statement of its own, so as written the
// cudaGetDeviceProperties() call below becomes its body and only runs when
// cudaGetDevice() FAILED. An error-handling line appears to be missing here --
// confirm against the real main.cpp.
if(error != cudaSuccess)
error = cudaGetDeviceProperties(&prop, iDevice);
// NOTE(review): same problem -- the declaration of maxPixelsPerPass becomes the
// body of this check, so it is scoped to the failure path and is not visible to
// the later "maxPixelsPerPass /= ..." statement. An error-handling line appears
// to be missing here as well.
if(error != cudaSuccess)
// The max amount to do per pass depends on the size of GPU memory and the size of unsigned integer.
// Global memory is divided by two so that both the value array and RGB array can both fit in memory.
DimensionSqType maxPixelsPerPass = (UINT_MAX > (prop.totalGlobalMem / 2)) ? (prop.totalGlobalMem / 2) : UINT_MAX;
// RGB + alpha is 4 BYTEs. Make sure two copies of the larger can fit in device memory.
// largerType holds the larger of the two per-pixel sizes, in bytes.
DimensionType largerType = ((4 * sizeof(BYTE)) < sizeof(ElementType)) ? sizeof(ElementType) : (4 * sizeof(BYTE));
// Divide by two for extra safety.
// Converts the byte budget into a pixel count (bytes / (bytes per pixel * 2)).
maxPixelsPerPass /= (largerType * 2);
</pre></big>
<big><pre>
/**
 * Smoothed escape-time value of the iteration z -> z^2 + c for a single point c.
 *
 * Note the parameter order: the imaginary part comes first.
 *
 * @param c_i         imaginary part of c
 * @param c_r         real part of c
 * @param iterations  iteration cap for the escape loop
 * @return for points that escape early enough:
 *         n + 1 - log(log(|z|)) / log(2), where n counts the iterations performed
 *         (including the two extra ones below);
 *         for points whose count exceeds the cap: (ElementType)iterations.
 */
__device__ ElementType mandelbrot(ElementType c_i, ElementType c_r, IterationType iterations)
{
    // z starts at c, i.e. the first application of z^2 + c to z = 0 is already done.
    ElementType z_r = c_r;
    ElementType z_i = c_i;
    ElementType z2_r = z_r * z_r;
    ElementType z2_i = z_i * z_i;
    IterationType n = 0;

    // Iterate until the cap is reached or |z|^2 >= 4 (the point has escaped).
    while(n < iterations && z2_r + z2_i < 4.0)
    {
        z_i = 2.0 * z_r * z_i + c_i;
        z_r = z2_r - z2_i + c_r;
        z2_r = z_r * z_r;
        z2_i = z_i * z_i;
        n++;
    }

    // Two further iterations are always performed before the smoothing formula
    // is evaluated (presumably to enlarge |z| and reduce the formula's error).
    for(int extra = 0; extra < 2; extra++)
    {
        z_i = 2.0 * z_r * z_i + c_i;
        z_r = z2_r - z2_i + c_r;
        z2_r = z_r * z_r;
        z2_i = z_i * z_i;
    }
    n += 2;

    if(n > iterations)
    {
        // This branch used to be empty, so the function fell off its end without
        // returning a value (undefined behaviour). Report the cap itself as the
        // value for points that did not escape in time.
        // NOTE(review): the value originally returned here is not visible; make
        // sure the colouring stage treats a result equal to `iterations` as
        // "inside the set".
        return (ElementType)iterations;
    }

    // Continuous (fractional) iteration count.
    return (ElementType)n + 1.0 - __logf(__logf(__dsqrt_rn(z2_r + z2_i)))/__logf(2.0);
}
</pre></big>
<big><pre>
// Store the smoothed iteration value of every pixel in img.
/**
 * Kernel: evaluates mandelbrot() once per pixel and stores the result in img.
 *
 * Launch layout: 2D grid of BLOCK_SIZE_X x BLOCK_SIZE_Y blocks. The pixel
 * coordinates are computed from these constants, so the launch must use
 * exactly this block shape. The grid may overhang the image; surplus threads
 * exit without touching memory.
 *
 * @param img         output, row-major: element dy * width + dx receives pixel (dx, dy)
 * @param yMax        imaginary coordinate of row 0
 * @param xMin        real coordinate of column 0
 * @param xScale      real-axis step per pixel column
 * @param yScale      imaginary-axis step per pixel row
 * @param iterations  iteration cap passed to mandelbrot()
 * @param width       image width in pixels
 * @param height      image height in pixels
 */
__global__ void getFractal(ElementType* img, ElementType yMax, ElementType xMin, ElementType xScale, ElementType yScale, IterationType iterations, DimensionType width, DimensionType height)
{
    DimensionType dx = blockIdx.x * BLOCK_SIZE_X + threadIdx.x;
    DimensionType dy = blockIdx.y * BLOCK_SIZE_Y + threadIdx.y;

    // Threads that fall outside the image must not write: without this early
    // exit the store below would run only for out-of-range pixels.
    if(dx >= width || dy >= height)
    {
        return;
    }

    // Index arithmetic is done in the wider DimensionSqType so dy * width cannot
    // overflow the per-axis type.
    img[(DimensionSqType)dy * (DimensionSqType)width + (DimensionSqType)dx] = mandelbrot(yMax - (ElementType)dy * yScale,
                                                                                        xMin + (ElementType)dx * xScale,
                                                                                        iterations);
}
</pre></big>
<big><pre>
// Recompute the pixels named in list with supersampling and store the median value in img.
/**
 * Kernel: supersamples selected pixels. Each thread takes one entry of list,
 * evaluates mandelbrot() on an ssaafactor x ssaafactor grid of sub-pixel
 * centres inside that pixel and writes the median of those values to img.
 *
 * Launch layout: 1D grid of BLOCK_SIZE_SSAA-thread blocks covering `length`
 * entries. The index is computed from that constant, so the launch must use
 * exactly that block size. Surplus threads exit without touching memory.
 *
 * @param img         in/out value image; only the elements named in list are overwritten
 * @param list        flat pixel indices (row * width + column) to recompute
 * @param length      number of entries in list
 * @param yMax        imaginary coordinate of the centre of row 0
 * @param xMin        real coordinate of the centre of column 0
 * @param xScale      real-axis step per pixel column
 * @param yScale      imaginary-axis step per pixel row
 * @param iterations  iteration cap passed to mandelbrot()
 * @param width       image width in pixels
 * @param ssaafactor  sub-pixels per axis; must be in 1..MAX_ALIASING_FACTOR,
 *                    otherwise the kernel leaves img untouched
 */
__global__ void getFractalSSAA(ElementType* img, DimensionSqType* list, DimensionSqType length, ElementType yMax, ElementType xMin,
                               ElementType xScale, ElementType yScale, IterationType iterations,
                               DimensionType width, AlisingFactorType ssaafactor)
{
    // Computed in the same (wider) type as length so the comparison below is
    // not affected by truncation of the index.
    DimensionSqType curr = (DimensionSqType)blockIdx.x * BLOCK_SIZE_SSAA + threadIdx.x;

    // Surplus threads of the last block have no list entry to process.
    if(curr >= length)
    {
        return;
    }

    // The sub-pixel buffer below holds MAX_ALIASING_FACTOR^2 values; a larger
    // factor would overrun it and a zero factor would divide by zero.
    if(ssaafactor == 0 || ssaafactor > MAX_ALIASING_FACTOR)
    {
        return;
    }

    DimensionSqType val = list[curr];
    ElementType xSubScale = xScale / ((ElementType)ssaafactor);
    ElementType ySubScale = yScale / ((ElementType)ssaafactor);

    // Get the centre of the top left subpixel:
    // pixel centre, moved half a pixel to the pixel's corner, then half a sub-pixel back in.
    xMin = xMin + (ElementType)(val % width) * xScale - (xScale / 2.0) + (xSubScale / 2.0);
    yMax = yMax - (ElementType)(val / width) * yScale + (yScale / 2.0) - (ySubScale / 2.0);

    // Get the values for each subpixel of this pixel
    ElementType subpixels[MAX_ALIASING_FACTOR * MAX_ALIASING_FACTOR];
    for(AlisingFactorType x = 0; x < ssaafactor; x++)
    {
        for(AlisingFactorType y = 0; y < ssaafactor; y++)
        {
            subpixels[x * ssaafactor + y] = mandelbrot(yMax - ySubScale * y , xMin + xSubScale * x, iterations);
        }
    }

    // Number of sub-pixel samples actually stored in subpixels.
    AlisingFactorSqType factor2 = (AlisingFactorSqType)ssaafactor * (AlisingFactorSqType)ssaafactor;

    // getMedian(values, k, count) is defined elsewhere; presumably it returns the
    // k-th smallest of the first `count` values -- TODO confirm.
    if(factor2 % 2 != 0)
    {
        // Odd sample count: a single middle element.
        img[val] = getMedian(subpixels, factor2 / 2, factor2);
    }
    else
    {
        // Even sample count: average the two middle elements.
        img[val] = (getMedian(subpixels, factor2 / 2 - 1, factor2)
                  + getMedian(subpixels, factor2 / 2, factor2))
                  / 2.0;
    }
}
</pre></big>
= Assignment 2 =