=== Assignment 3 ===
For our optimizations to the parallel code we stuck to our plan and focused on reducing the number of cudaMemcpy() calls made in each of the functions.
void Image::negateImage(Image& oldImage)
/* negates the image */
{
    Image tempImage(N, M, Q);
    int* d_temp = nullptr;
    int* d_img = nullptr;
    int size = N * M;
    int nblocks = (size + ntpb - 1) / ntpb; // round up so every pixel gets a thread
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    cudaMalloc((void**)&d_img, size * sizeof(int));
    // copy both the temporary image and the source image to the device
    cudaMemcpy(d_temp, tempImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_img, pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    negate<<<nblocks, ntpb>>>(d_temp, d_img, size);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    // copy the result back to the host
    cudaMemcpy(tempImage.pixelVal, d_temp, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_temp);
    cudaFree(d_img);
    oldImage = tempImage;
}
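The negate kernel itself is not reproduced in this section. A minimal sketch of what it might look like, assuming a flat row-major pixel buffer and an 8-bit gray ceiling (the real kernel may use the image's Q value instead):

// Hypothetical sketch of the negate kernel; the body and the 255 gray
// ceiling are our assumptions, not the report's actual kernel.
__global__ void negate(int* temp, int* img, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)                 // guard against the rounded-up grid
        temp[idx] = 255 - img[idx]; // invert the gray level of each pixel
}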
void Image::reflectImage(bool flag, Image& oldImage)
/* reflects the image based on the user's input */
{
    int rows = oldImage.N;
    int cols = oldImage.M;
    Image tempImage(oldImage);
    int* d_temp = nullptr;
    int* d_img = nullptr;
    int size = rows * cols;
    int nblocks = (size + ntpb - 1) / ntpb; // round up so every pixel gets a thread
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    cudaMalloc((void**)&d_img, size * sizeof(int));
    // copy both the temporary image and the source image to the device
    cudaMemcpy(d_temp, tempImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_img, oldImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    if (flag) {
        horizontalReflect<<<nblocks, ntpb>>>(d_temp, d_img, size, rows, cols);
    }
    else {
        verticalReflect<<<nblocks, ntpb>>>(d_temp, d_img, size, rows, cols);
    }
    cudaDeviceSynchronize();
    // copy the result back to the host
    cudaMemcpy(tempImage.pixelVal, d_temp, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_temp);
    cudaFree(d_img);
    oldImage = tempImage;
}
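For reference, here is one plausible shape for the two reflection kernels. The row-major layout and the direction of each flip are our assumptions; the index arithmetic in the real kernels may differ:

// Hypothetical sketches of the reflection kernels; each thread computes
// its pixel's row and column, then writes it to the mirrored position.
__global__ void horizontalReflect(int* temp, int* img, int size, int rows, int cols)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        int row = idx / cols;
        int col = idx % cols;
        temp[row * cols + (cols - 1 - col)] = img[idx]; // mirror left-right
    }
}

__global__ void verticalReflect(int* temp, int* img, int size, int rows, int cols)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        int row = idx / cols;
        int col = idx % cols;
        temp[(rows - 1 - row) * cols + col] = img[idx]; // mirror top-bottom
    }
}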
void Image::enlargeImage(int value, Image& oldImage)
/* enlarges the image into tempImage, then stores the larger image back in oldImage */
{
    int rows = oldImage.N * value;
    int cols = oldImage.M * value;
    int gray = oldImage.Q;
    Image tempImage(rows, cols, gray);
    int r = oldImage.N;
    int c = oldImage.M;
    int* d_temp = nullptr;
    int* d_img = nullptr;
    int size = rows * cols;
    int nblocks = (size + ntpb - 1) / ntpb; // round up so every pixel gets a thread
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    cudaMalloc((void**)&d_img, (r * c) * sizeof(int)); // source image only needs r * c pixels
    // copy both the temporary image and the source image to the device
    cudaMemcpy(d_temp, tempImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_img, oldImage.pixelVal, (r * c) * sizeof(int), cudaMemcpyHostToDevice);
    enlarge<<<nblocks, ntpb>>>(d_temp, d_img, size, value, c, cols);
    cudaDeviceSynchronize();
    // copy the enlarged image back to the host
    cudaMemcpy(tempImage.pixelVal, d_temp, size * sizeof(int), cudaMemcpyDeviceToHost);
    // free device memory
    cudaFree(d_temp);
    cudaFree(d_img);
    oldImage = tempImage;
}
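The enlarge kernel is launched with one thread per pixel of the enlarged image. A minimal sketch, assuming nearest-neighbour scaling where each output pixel maps back to one source pixel (the actual kernel body is not shown in this report):

// Hypothetical sketch of the enlarge kernel; c is the source width,
// cols is the enlarged width, value is the scale factor.
__global__ void enlarge(int* temp, int* img, int size, int value, int c, int cols)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        int row = idx / cols;  // position in the enlarged image
        int col = idx % cols;
        temp[idx] = img[(row / value) * c + (col / value)]; // nearest-neighbour lookup
    }
}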
As the code above shows, every function we converted to parallel does a great deal of copying to and from the device. The use of a temporary image makes this even more expensive, since we are essentially copying the image to and from the device twice per call. To eliminate these cudaMemcpy() calls, we create the image to be processed directly on the device, so our parallel functions can operate on its pixels without shuttling both the image and a temporary image between host and device. With this change we reduced the six cudaMemcpy() calls in our parallel code to two in our optimized code.
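A minimal sketch of that optimized pattern follows. The member d_pixelVal and the uploadImage()/downloadImage() helpers are illustrative names of our own, not the actual members of the Image class; the idea is that pixel data is copied in once when the image is loaded and out once when it is saved, so each operation works entirely on device memory:

// Hypothetical sketch of keeping the image resident on the device;
// d_pixelVal, uploadImage(), and downloadImage() are assumed names.
void Image::uploadImage()   // the one cudaMemcpy() in, when the image is loaded
{
    cudaMalloc((void**)&d_pixelVal, N * M * sizeof(int));
    cudaMemcpy(d_pixelVal, pixelVal, N * M * sizeof(int), cudaMemcpyHostToDevice);
}

void Image::downloadImage() // the one cudaMemcpy() out, when the image is saved
{
    cudaMemcpy(pixelVal, d_pixelVal, N * M * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_pixelVal);
}

void Image::negateImage(Image& oldImage)
{
    int size = N * M;
    int nblocks = (size + ntpb - 1) / ntpb;
    int* d_temp = nullptr;
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    negate<<<nblocks, ntpb>>>(d_temp, oldImage.d_pixelVal, size);
    cudaDeviceSynchronize();
    cudaFree(oldImage.d_pixelVal);
    oldImage.d_pixelVal = d_temp; // the result stays on the device; no host copies
}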