1
edit
Changes
→Assignment 3
This removed the edge checks from the convolution kernel, allowing for a dramatic increase in Gaussian performance.
<pre>
//old
for (int i = -kXRad; i <= kXRad; i++){
for (int j = -kYRad; j <= kYRad; j++){
//wrap image
outPixel.Red += kernelElement * imageElement.Red;
outPixel.Green += kernelElement * imageElement.Green;
outPixel.Blue += kernelElement * imageElement.Blue;
}
}
</pre>
<pre>
//new
for (int i = -kXRad; i <= kXRad; i++) { for (int j = -kYRad; j <= kYRad; j++) { const float kernelElement = convolutionKernel[kernelIndex(i, j, kernelXSize, kernelYSize)]; const GPU_RGBApixel imageElement = img[imageIndex(xCorr + i, yCorr + j, height + kernelYSize - 1)]; outPixel.Red += kernelElement * imageElement.Red; outPixel.Green += kernelElement * imageElement.Green; outPixel.Blue += kernelElement * imageElement.Blue;
const float kernelElement = convolutionKernel[kernelIndex(i, j, kernelXSize, kernelYSize)];
const GPU_RGBApixel imageElement = img[imageIndex(xCorr + i, yCorr + j, height + kernelYSize - 1)];
outPixel.Red += kernelElement * imageElement.Red;
outPixel.Green += kernelElement * imageElement.Green;
outPixel.Blue += kernelElement * imageElement.Blue;
}
}
</pre>
<b><font style="font-size:140%"> Canny </font></b>
notMaxSuppression used to have a large if block in the middle of the kernel that could potentially split the threads into four different execution paths. It was rewritten using a pre-computed table of outcomes and a little “mathemagic” to eliminate the if statements. This reduces the branching from four paths to two.
<pre>
//changed the angle finding logic
//old
int index = imageIndex(x, y, height);
const float& gM = gradMagnitude[index];
result[index] = gM;
//don't touch the edges
if (x < 1 || y < 1 || x >= width - 1 || y >= height - 1) { return;
}
const float& gA = gradAngle[index];
int i;
int j;
result[index] = gradMagnitude[index];
if (((gA >= -M_PId8) && (gA <= M_PId8)) || (gA >= 7 * M_PId8) || (gA <= -7 * M_PId8)){ i = 0; j = -1;}else if (((gA <= 3 * M_PId8) && (gA > M_PId8)) || ((gA <= -5 * M_PId8) && (gA > -7 * M_PId8))){ i = -1; j = 1;}else if (((gA >= 5 * M_PId8) && (gA < 7 * M_PId8)) || ((gA < -M_PId8) && (gA >= -3 * M_PId8))){ i = 0-1; j = -1;}else{j i = -1; j = 0;
}
}
//new
int xCorr = x + 1;
int yCorr = y + 1;
int heightCorr = height + 2;
int index = imageIndex(xCorr, yCorr, heightCorr);
const float gM = gradMagnitude[index];
const int sectorConvertX[9] = { 0, -1, -1, -1, -1, -1, -1, 0 , 0 };
const int sectorConvertY[9] = { -1, 1, 1, -1, -1, 0, 0, -1 , -1 };
const float gA = fabsf(gradAngle[imageIndex(x, y, height)]);
const int sector = int(gA / M_PI) * 8;
int i = sectorConvertX[sector];
int j = sectorConvertY[sector];
if (!((gM <= gradMagnitude[imageIndex(xCorr + i, yCorr + j, heightCorr)]) || (gM <= gradMagnitude[imageIndex(xCorr - 1, yCorr - j, heightCorr)]))){ result[index] = gM;}else{ result[index] = gM0.0;
}
</pre>
<pre>
//old
if (pixel >= lowerThreshold && pixel < upperThreshold) { int iLower = (x == 0) ? 0 : -1; int iUpper = (x == width - 1) ? 0 : 1; int jLower = (height == 0) ? 0 : -1; int jUpper = (x == width - 1) ? 0 : 1; ret = 0.0; for (int i = iLower; i <= iUpper; i++) { for (int j = jLower; j <= jUpper; j++) { if (image[imageIndex(x + i, y + j, height)]) {
ret = upperThreshold + 10.0;
for (int i = iLower; i <= iUpper; i++){
for (int j = jLower; j <= jUpper; j++){
if (image[imageIndex(x + i, y + j, height)]){
ret = upperThreshold + 1;
}
}
}
}
//new
if (level >= lowerThreshold && level < upperThreshold){
</pre>
<b><font style="font-size:140%"> Grey world </font></b>
Global memory accesses were reduced from twelve to one, and the number of branching if statements was reduced as well.
<pre>
if(averageR > 0.05){
newGrey[ imageIndex(x, y, height) ].Red = (255 < Original[ imageIndex(x, y, height) ].Red * averageAll / averageR)? 255 : Original[ imageIndex(x, y, height) ].Red * averageAll / averageR; } else { newGrey[ imageIndex(x, y, height) ].Red = (255 < Original[ imageIndex(x, y, height) ].Red + averageAll)? 255 : Original[ imageIndex(x, y, height) ].Red + averageAll;
}
if(averageB > 0.05){
newGrey[ imageIndex(x, y, height) ].Blue = (255 < Original[ imageIndex(x, y, height) ].Blue * averageAll / averageB)? 255 : Original[ imageIndex(x, y, height) ].Blue * averageAll / averageB; } else { newGrey[ imageIndex(x, y, height) ].Blue = (255 < Original[ imageIndex(x, y, height) ].Blue + averageAll)? 255 : Original[ imageIndex(x, y, height) ].Blue + averageAll;
}
if(averageG > 0.05){
newGrey[ imageIndex(x, y, height) ].Green = (255 < Original[ imageIndex(x, y, height) ].Green * averageAll / averageG)? 255 : Original[ imageIndex(x, y, height) ].Green * averageAll / averageG; } else { newGrey[ imageIndex(x, y, height) ].Green = (255 < Original[ imageIndex(x, y, height) ].Green + averageAll)? 255 : Original[ imageIndex(x, y, height) ].Green + averageAll;
}
GPU_RGBApixel orig = Original[imageIndex(x, y, height)];
GPU_RGBApixel grey = { 0, 0, 0 };
if(averageR > 0.05) { grey.Red = orig.Red * averageAll / averageR;}else{ grey.Red = orig.Red + averageAll;
}
}
}
if(grey.Red > 255) grey.Red = 255;
if(grey.Blue > 255) grey.Blue = 255;
if(grey.Green > 255) grey.Green = 255;
newGrey[imageIndex(x, y, height)] = grey;
</pre>
<b><font style="font-size:140%"> Autocontrast </font></b>
The only optimization made was reducing global memory access from 2 to 1.
const RGBApixel& imgPixel = img[imageIndex(x, y, height)];
RGBApixel& resultPixel = result[imageIndex(x, y, height)];
int index = imageIndex(x, y, height);
const RGBApixel& imgPixel = img[index];
RGBApixel& resultPixel = result[index];
<b><font style="font-size:140%"> Resize </font></b>
Logic was tinkered with, and some redundant calculations were eliminated by storing the results in local registers. In addition, global memory access was reduced from 12 times per thread to 4. Finally, the result of the pixel calculation was stored in a temporary local register and then written to global memory, reducing the writes to global memory.
<pre>
newImage[i * NewWidth + j].Red =
(ebmpBYTE)((1.0 - ThetaI - ThetaJ + ThetaI*ThetaJ)*(OldImage[I * OldWidth + J].Red) + (ThetaI - ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J].Red) + (ThetaJ - ThetaI*ThetaJ)*(OldImage[I * OldWidth + J + 1].Red) + (ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J + 1].Red));
newImage[i * NewWidth + j].Green =
(ebmpBYTE)((1.0 - ThetaI - ThetaJ + ThetaI*ThetaJ)*(OldImage[I * OldWidth + J].Green) + (ThetaI - ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J].Green) + (ThetaJ - ThetaI*ThetaJ)*(OldImage[I * OldWidth + J + 1].Green) + (ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J + 1].Green));
newImage[i * NewWidth + j].Blue =
(ebmpBYTE)((1.0 - ThetaI - ThetaJ + ThetaI*ThetaJ)*(OldImage[I * OldWidth + J].Blue)
+ (ThetaI - ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J].Blue)
+ (ThetaJ - ThetaI*ThetaJ)*(OldImage[I * OldWidth + J + 1].Blue)
+ (ThetaI*ThetaJ)*(OldImage[(I + 1) * OldWidth + J + 1].Blue));
float t4 = ThetaI*ThetaJ;
float t1 = 1.0 - ThetaI - ThetaJ + t4;
float t2 = ThetaI - t4;
float t3 = ThetaJ - t4;
int p1 = I * OldWidth + J;
int p2 = (I + 1) * OldWidth + J;
int p3 = I * OldWidth + J + 1;
int p4 = (I + 1) * OldWidth + J + 1;
RGBApixel temp;
RGBApixel temp1 = OldImage[p1];
RGBApixel temp3 = OldImage[p3];
RGBApixel temp2 = OldImage[p2];
RGBApixel temp4 = OldImage[p4];
temp.Red =(ebmpBYTE)((t1)*(temp1.Red)+ (t2)*(temp2.Red) + (t3)*(temp3.Red) + (t4)*(temp4.Red));temp.Green = (ebmpBYTE)((t1)*(temp1.Green) + (t2)*(temp2.Green) + (t3)*(temp3.Green) + (t4)*(temp4.Green));temp.Blue = (ebmpBYTE)((t1)*(temp1.Blue) + (t2)*(temp2.Blue) + (t3)*(temp3.Blue) + (t4)*(temp4.Blue));
</pre>
During the optimization of resize, an attempt was made to make use of shared memory. While it was eventually made to work, there was a 250% performance decrease. Here is the last working attempt at shared memory use. This method introduced branching in order to check for the edges of the blocks. It also caused severe bank conflicts, and further analysis revealed that it would not work because each resultant pixel's calculations are independent of every other pixel's; thus shared memory use was not possible.
<pre>__global__ void c_newPixel(RGBApixel * OldImage, RGBApixel *newImage, int OldWidth, int OldHeight, int NewWidth, int NewHeight) { int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; if (i >= NewHeight - 1 || j >= NewWidth - 1) { return; } int I, J; float ThetaI, ThetaJ; ThetaJ = (float)(j*(OldHeight - 1.0)) / (float)(NewHeight - 1.0); J = (int)floor(ThetaJ); ThetaJ -= J; ThetaI = (float)(i*(OldWidth - 1.0)) / (float)(NewWidth - 1.0); I = (int)floor(ThetaI); ThetaI -= I; int blkArraySize = (blockDim.x + 1) * (blockDim.x + 1); int blkArrayWidth = (blockDim.x + 1) * ty; extern __shared__ RGBApixel pixel[]; pixel[tx + blkArrayWidth] = OldImage[I * OldWidth + J];
pixel[tx + blkArrayWidth] = OldImage[I * OldWidth + J]; pixel[tx + blkArrayWidth + blkArraySize] = OldImage[(I + 1) * OldWidth + J]; if (ty == blockDim.y - 1){ int I1; float ThetaI1; ThetaI1 = (float)((i + 1)*(OldWidth - 1.0)) / (float)(NewWidth - 1.0); I1 = (int)floor(ThetaI1); int J1; float ThetaJ1; ThetaJ1 = (float)((j)*(OldHeight - 1.0)) / (float)(NewHeight - 1.0); J1 = (int)floor(ThetaJ1); pixel[tx + blockDim.x + blkArrayWidth] = OldImage[I1 * OldWidth + J1]; pixel[tx + blockDim.x + blkArrayWidth + blkArraySize] = OldImage[(I1 + 1) * OldWidth + J1]; }
int I1; float ThetaI1; ThetaI1 = (float)((i)*(OldWidth - 1.0)) / (float)(NewWidth - 1.0); I1 = (int)floor(ThetaI1);
pixelResult.Red =(ebmpBYTE)((t1)*(pixel[tx ps1].Red) + blockDim(t2)*(pixel[ps2].x Red) + blkArrayWidth (t3)*(pixel[ps3].Red) + blkArraySize] = OldImage[(I1 + 1t4) * OldWidth + J1(pixel[ps4].Red));
pixelResult.Blue = (ebmpBYTE)((t1)*(pixel[ps1].Blue)
+ (t2)*(pixel[ps2].Blue)
+ (t3)*(pixel[ps3].Blue)
+ (t4)*(pixel[ps4].Blue));
newImage[i * NewWidth + j] = pixelResult;
}
</pre>