1
edit
Changes
→Assignment 3
</pre>
//old
Global memory accesses were reduced from twelve to one, and the number of branching if statements was reduced as well.
<pre>
if(averageR > 0.05){
newGrey[ imageIndex(x, y, height) ].Red = (255 < Original[ imageIndex(x, y, height) ].Red * averageAll / averageR)? 255 : Original[ imageIndex(x, y, height) ].Red * averageAll / averageR; } else { newGrey[ imageIndex(x, y, height) ].Red = (255 < Original[ imageIndex(x, y, height) ].Red + averageAll)? 255 : Original[ imageIndex(x, y, height) ].Red + averageAll;
}
if(averageB > 0.05){
newGrey[ imageIndex(x, y, height) ].Blue = (255 < Original[ imageIndex(x, y, height) ].Blue * averageAll / averageB)? 255 : Original[ imageIndex(x, y, height) ].Blue * averageAll / averageB; } else { newGrey[ imageIndex(x, y, height) ].Blue = (255 < Original[ imageIndex(x, y, height) ].Blue + averageAll)? 255 : Original[ imageIndex(x, y, height) ].Blue + averageAll;
}
if(averageG > 0.05){
newGrey[ imageIndex(x, y, height) ].Green = (255 < Original[ imageIndex(x, y, height) ].Green * averageAll / averageG)? 255 : Original[ imageIndex(x, y, height) ].Green * averageAll / averageG; } else { newGrey[ imageIndex(x, y, height) ].Green = (255 < Original[ imageIndex(x, y, height) ].Green + averageAll)? 255 : Original[ imageIndex(x, y, height) ].Green + averageAll;
}
GPU_RGBApixel orig = Original[imageIndex(x, y, height)];
GPU_RGBApixel grey = { 0, 0, 0 };
if(averageR > 0.05) { grey.Red = orig.Red * averageAll / averageR;}else{ grey.Red = orig.Red + averageAll;
}
}
}
if(grey.Red > 255) grey.Red = 255;
if(grey.Blue > 255) grey.Blue = 255;
if(grey.Green > 255) grey.Green = 255;
newGrey[imageIndex(x, y, height)] = grey;
</pre>
<b><font style="font-size:140%"> Autocontrast </font></b>
The only optimization made was reducing global memory access from 2 to 1.
const RGBApixel& imgPixel = img[imageIndex(x, y, height)];
RGBApixel& resultPixel = result[imageIndex(x, y, height)];
int index = imageIndex(x, y, height);
const RGBApixel& imgPixel = img[index];
RGBApixel& resultPixel = result[index];
<b><font style="font-size:140%"> Resize </font></b>
The logic was reworked, and some redundant calculations were eliminated by storing the results in local registers. In addition, global memory accesses were reduced from 12 per thread to 4. Finally, the result of the pixel calculation was stored in a temporary local register and then written to global memory, reducing the number of writes to global memory.
<pre>
newImage[i * NewWidth + j].Red =
(ebmpBYTE)((1.0 - ThetaI - ThetaJ + ThetaI*ThetaJ)*(OldImage[I * OldWidth + J].Red) + (ThetaI - ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J].Red) + (ThetaJ - ThetaI*ThetaJ)*(OldImage[I * OldWidth + J + 1].Red) + (ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J + 1].Red));
newImage[i * NewWidth + j].Green =
(ebmpBYTE)((1.0 - ThetaI - ThetaJ + ThetaI*ThetaJ)*(OldImage[I * OldWidth + J].Green) + (ThetaI - ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J].Green) + (ThetaJ - ThetaI*ThetaJ)*(OldImage[I * OldWidth + J + 1].Green) + (ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J + 1].Green));
newImage[i * NewWidth + j].Blue =
(ebmpBYTE)((1.0 - ThetaI - ThetaJ + ThetaI*ThetaJ)*(OldImage[I * OldWidth + J].Blue)
+ (ThetaI - ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J].Blue)
+ (ThetaJ - ThetaI*ThetaJ)*(OldImage[I * OldWidth + J + 1].Blue)
+ (ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J + 1].Blue));
float t4 = ThetaI*ThetaJ;
float t1 = 1.0 - ThetaI - ThetaJ + t4;
float t2 = ThetaI - t4;
float t3 = ThetaJ - t4;
int p1 = I * OldWidth + J;
int p2 = (I + 1) * OldWidth + J;
int p3 = I * OldWidth + J + 1;
int p4 = (I + 1) * OldWidth + J + 1;
RGBApixel temp;
RGBApixel temp1 = OldImage[p1];
RGBApixel temp3 = OldImage[p3];
RGBApixel temp2 = OldImage[p2];
RGBApixel temp4 = OldImage[p4];
temp.Red =(ebmpBYTE)((t1)*(temp1.Red)+ (t2)*(temp2.Red) + (t3)*(temp3.Red) + (t4)*(temp4.Red));temp.Green = (ebmpBYTE)((t1)*(temp1.Green) + (t2)*(temp2.Green) + (t3)*(temp3.Green) + (t4)*(temp4.Green));temp.Blue = (ebmpBYTE)((t1)*(temp1.Blue) + (t2)*(temp2.Blue) + (t3)*(temp3.Blue) + (t4)*(temp4.Blue));
During the optimization of resize, an attempt was made to make use of shared memory. While it was eventually made to work, it resulted in a 250% performance decrease. Here is the last working attempt at shared memory use. This method introduced branching in order to detect the edges of the blocks. It also caused severe bank conflicts, and further analysis revealed that it would not work because each resultant pixel's calculations are independent of every other pixel's; thus shared memory use was not possible.
<pre>__global__ void c_newPixel(RGBApixel * OldImage, RGBApixel *newImage,int OldWidth, int OldHeight, int NewWidth, int NewHeight){
int OldWidth, j = blockIdx.x * blockDim.x + threadIdx.x; int OldHeight, i = blockIdx.y * blockDim.y + threadIdx.y; int NewWidth, tx = threadIdx.x; int NewHeight)ty = threadIdx.y;
if (i >= NewHeight - 1 || j >= NewWidth - 1){ return; }
int I, J; float ThetaI, ThetaJ; ThetaJ = (float)(j *(OldHeight - 1.0)) / (float)(NewHeight - 1.0); J = (int)floor(ThetaJ); ThetaJ -= J; ThetaI = (float)(i*(OldWidth - 1.0)) / (float)(NewWidth - 1.0); I = (int)floor(ThetaI); ThetaI -= I; int blkArraySize = blockIdx(blockDim.x + 1) * (blockDim.x + threadIdx1); int blkArrayWidth = (blockDim.x+ 1) * ty;
pixel[tx + blkArrayWidth] = OldImage[I * OldWidth + J]; pixel[tx + blkArrayWidth + blkArraySize] = OldImage[(I + 1) * OldWidth + J]; if (ty == blockDim.y - 1){ int I1; float ThetaI1; ThetaI1 = (float)((i + 1)*(OldWidth - 1.0)) / (float)(NewWidth - 1.0); I1 = (int)floor(ThetaI1); int J1; float ThetaJ1; ThetaJ1 = (float)((j)*(OldHeight - 1.0)) / (float)(NewHeight - 1.0); J1 = (int)floor(ThetaJ1); pixel[tx + blockDim.x + blkArrayWidth] = threadIdxOldImage[I1 * OldWidth + J1]; pixel[tx + blockDim.x+ blkArrayWidth + blkArraySize] = OldImage[(I1 + 1) * OldWidth + J1]; }
if (tx == blockDim.x - 1){ int ty J1; float ThetaJ1; ThetaJ1 = threadIdx(float)((j + 1)*(OldHeight - 1.0)) / (float)(NewHeight - 1.y0); J1 = (int)floor(ThetaJ1);
pixelResult.Blue = (ebmpBYTE)((t1)*(pixel[ps1].Blue)
+ (t2)*(pixel[ps2].Blue)
+ (t3)*(pixel[ps3].Blue)
+ (t4)*(pixel[ps4].Blue));
newImage[i * NewWidth + j] = pixelResult;
}
</pre>