
Installation Wizards

3,841 bytes added, 14:44, 10 April 2017
=== Assignment 3 ===
 
For the optimization of our parallel code we stuck to our plan and focused on reducing the number of cudaMemcpy() calls used throughout each of the functions.
 
void Image::negateImage(Image& oldImage)
/*negates image*/
{
    Image tempImage(N, M, Q);
    int* d_temp = nullptr;
    int* d_img = nullptr;
    int size = N * M;
    int nblocks = (size + ntpb - 1) / ntpb; // round up so the last partial block is launched
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    cudaMalloc((void**)&d_img, size * sizeof(int));
    cudaMemcpy(d_temp, tempImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_img, pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    negate<<<nblocks, ntpb>>>(d_temp, d_img, size);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    cudaMemcpy(tempImage.pixelVal, d_temp, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_temp);
    cudaFree(d_img);
    oldImage = tempImage;
}
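The negate kernel itself is not listed in this section. Since the launch passes only the two buffers and the pixel count (the image's grayscale maximum Q is not passed in), we assume it computes 255 minus each pixel. A minimal CPU reference of that assumed per-pixel mapping, useful for checking the kernel's output:

```cpp
#include <vector>

// CPU reference of the per-pixel mapping we assume negate computes:
// out[i] = 255 - in[i] (hypothetical; the real kernel may use the
// image's Q value instead of a fixed 255).
std::vector<int> negateReference(const std::vector<int>& in) {
    std::vector<int> out(in.size());
    for (std::size_t i = 0; i < in.size(); ++i)
        out[i] = 255 - in[i];
    return out;
}
```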
 
void Image::reflectImage(bool flag, Image& oldImage)
/*Reflects the Image based on the user's input*/
{
    int rows = oldImage.N;
    int cols = oldImage.M;
    Image tempImage(oldImage);
    int* d_temp = nullptr;
    int* d_img = nullptr;
    int size = rows * cols;
    int nblocks = (size + ntpb - 1) / ntpb; // round up so the last partial block is launched
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    cudaMalloc((void**)&d_img, size * sizeof(int));
    cudaMemcpy(d_temp, tempImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_img, oldImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    if (flag) {
        horizontalReflect<<<nblocks, ntpb>>>(d_temp, d_img, size, rows, cols);
    }
    else {
        verticalReflect<<<nblocks, ntpb>>>(d_temp, d_img, size, rows, cols);
    }
    cudaError_t err = cudaGetLastError(); // check the launch, as in negateImage
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    cudaMemcpy(tempImage.pixelVal, d_temp, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_temp);
    cudaFree(d_img);
    oldImage = tempImage;
}
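The two reflect kernels are also not listed here. From the arguments passed, we assume horizontalReflect mirrors each row (flips columns) and verticalReflect mirrors each column (flips rows). A CPU reference of those assumed index mappings:

```cpp
#include <vector>

// CPU references of the index mappings we assume the reflect kernels use
// (hypothetical: the flip directions are our reading of the launch code).

// Horizontal reflect mirrors each row: out(r, c) = in(r, cols-1-c).
std::vector<int> horizontalReflectRef(const std::vector<int>& in, int rows, int cols) {
    std::vector<int> out(in.size());
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            out[r * cols + c] = in[r * cols + (cols - 1 - c)];
    return out;
}

// Vertical reflect mirrors each column: out(r, c) = in(rows-1-r, c).
std::vector<int> verticalReflectRef(const std::vector<int>& in, int rows, int cols) {
    std::vector<int> out(in.size());
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            out[r * cols + c] = in[(rows - 1 - r) * cols + c];
    return out;
}
```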
 
void Image::enlargeImage(int value, Image& oldImage)
/*enlarges the Image into tempImage, then stores the larger image in oldImage*/
{
    int rows = oldImage.N * value;
    int cols = oldImage.M * value;
    int gray = oldImage.Q;
    Image tempImage(rows, cols, gray);
    int r = oldImage.N;
    int c = oldImage.M;
    int* d_temp = nullptr;
    int* d_img = nullptr;
    int size = rows * cols;
    int nblocks = (size + ntpb - 1) / ntpb; // round up so the last partial block is launched
    cudaMalloc((void**)&d_temp, size * sizeof(int));
    cudaMalloc((void**)&d_img, (r * c) * sizeof(int)); // source image only needs r * c ints
    cudaMemcpy(d_temp, tempImage.pixelVal, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_img, oldImage.pixelVal, (r * c) * sizeof(int), cudaMemcpyHostToDevice);
    enlarge<<<nblocks, ntpb>>>(d_temp, d_img, size, value, c, cols);
    cudaError_t err = cudaGetLastError(); // check the launch, as in negateImage
    if (err != cudaSuccess)
        printf("Error: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    //set the image's data
    cudaMemcpy(tempImage.pixelVal, d_temp, size * sizeof(int), cudaMemcpyDeviceToHost);
    //free device mem
    cudaFree(d_temp);
    cudaFree(d_img);
    oldImage = tempImage;
}
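From the arguments passed to the enlarge kernel (scale value, source width c, destination width cols), we assume it does nearest-neighbour enlargement, replicating each source pixel into a value-by-value block. A CPU reference of that assumed mapping:

```cpp
#include <vector>

// CPU reference of the nearest-neighbour mapping we assume enlarge uses:
// out(r, c) = in(r / value, c / value), i.e. each source pixel is copied
// into a value x value block (hypothetical: inferred from the kernel's
// argument list, not from its body).
std::vector<int> enlargeReference(const std::vector<int>& in,
                                  int srcCols, int value) {
    int srcRows = static_cast<int>(in.size()) / srcCols;
    int rows = srcRows * value;
    int cols = srcCols * value;
    std::vector<int> out(rows * cols);
    for (int i = 0; i < rows * cols; ++i) {
        int r = i / cols;
        int c = i % cols;
        out[i] = in[(r / value) * srcCols + (c / value)];
    }
    return out;
}
```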
 
As seen above, every function we converted to parallel does a lot of copying to and from the device. The use of a temporary image makes this even more expensive, since we are essentially copying the image to and from the device twice. To remove these cudaMemcpy() calls from each function, we decided to create the image we are going to process directly on the device, allowing our parallel functions to operate on its pixels without copying both the image and a temporary image back and forth. With this solution we effectively reduced the 6 cudaMemcpy() calls in our parallel code to 2 cudaMemcpy() calls in our optimized code.
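A minimal sketch of that device-resident pattern, assuming hypothetical names (our actual class keeps the device pointer as a member; the helper and kernel names here are illustrative only):

```cuda
// Sketch of the optimized pattern: the image is copied to the device once,
// all kernels operate between two device buffers, and the result is copied
// back once. Names are illustrative, not our exact code.
void processOnDevice(int* hostPixels, int n, int ntpb) {
    int* d_img = nullptr;
    int* d_temp = nullptr;
    cudaMalloc((void**)&d_img, n * sizeof(int));
    cudaMalloc((void**)&d_temp, n * sizeof(int));
    // copy #1: host -> device, once for the whole session
    cudaMemcpy(d_img, hostPixels, n * sizeof(int), cudaMemcpyHostToDevice);
    int nblocks = (n + ntpb - 1) / ntpb;
    // any number of kernels can now run back to back with no extra copies,
    // swapping d_img / d_temp between stages as needed
    negate<<<nblocks, ntpb>>>(d_temp, d_img, n);
    cudaDeviceSynchronize();
    // copy #2: device -> host, once at the end
    cudaMemcpy(hostPixels, d_temp, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_temp);
    cudaFree(d_img);
}
```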