1
edit
Changes
→Assignment 3
We tried to use shared memory; however, our array size was simply too large for it.
In the end, after countless tiring changes to the code, we settled on this simple configuration as the fastest for our purpose of negating 1000 images.
Kernel:
<pre>
/*
 * Element-wise negative of a flattened image:
 *     new_array[i] = 255 - old_array[i]   for every i in [0, n)
 *
 * new_array  output buffer, at least n ints
 * old_array  input buffer, at least n ints
 * n          number of elements to process
 *
 * Indexing uses only the x dimension of the grid and block. Each thread
 * handles at most one element, so the launch must supply at least n threads
 * in total; threads whose global index is >= n do nothing.
 */
__global__ void cuda_negate_image(int * new_array, int * old_array, int n){
    // Flat global index of this thread across the whole grid.
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // Guard the tail: the grid rarely divides n evenly.
    if (i < n)
        new_array[i] = 255 - old_array[i];
}
</pre>
Method:
<pre>
void Image::negateImage(Image& oldImage)
/*
 * Negates oldImage on the GPU and stores the result in this image's pixelVal.
 *
 * The N x M pixel grid of oldImage is flattened row-major into a 1D buffer,
 * copied to the device, run through cuda_negate_image, copied back and
 * unflattened into this->pixelVal. The kernel is launched 1000 times on the
 * same input as a benchmark; every launch writes the same output.
 *
 * Prints the element count (twice, as before) and the elapsed time of the
 * 1000 launches in milliseconds. On a CUDA failure a message is written to
 * cerr and pixelVal is left unmodified. All host and device buffers are
 * released before returning.
 *
 * NOTE(review): ntpb (threads per block) is defined outside this block, and
 * oldImage is assumed to be at least N x M - confirm against the class.
 */
{
    const int rows = N;
    const int cols = M;
    const int n = rows * cols;

    cout << n << endl;

    // Nothing to do for an empty image; a launch with 0 blocks is invalid.
    if (n <= 0)
        return;

    const size_t bytes = static_cast<size_t>(n) * sizeof(int);

    int * h_old_array = new int[n];
    int * result = new int[n];
    int * d_old_array = 0;
    int * d_new_array = 0;

    //copy 2D array to 1D array (Row Major)
    int k = 0;
    for(int i = 0; i < rows; i++){
        for(int j = 0; j < cols; j++){
            h_old_array[k] = oldImage.pixelVal[i][j];
            k++;
        }
    }

    // Ceiling division so the grid covers every element.
    const int nblks = (n + ntpb - 1) / ntpb;
    cout << n;

    cudaError_t err = cudaMalloc((void**)&d_old_array, bytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_new_array, bytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_old_array, h_old_array, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        clock_t start_s = clock();
        for(int i = 0; i < 1000; i++){
            cuda_negate_image<<<nblks, ntpb>>>(d_new_array, d_old_array, n);
        }
        // Launch-configuration errors are only visible through cudaGetLastError.
        err = cudaGetLastError();
        // Kernel launches are asynchronous: wait for them to finish so the
        // timer measures execution rather than just launch overhead.
        if (err == cudaSuccess)
            err = cudaDeviceSynchronize();
        clock_t stop_s = clock();
        cout << "time: " << (stop_s-start_s)/double(CLOCKS_PER_SEC)*1000 << endl;
    }

    if (err == cudaSuccess)
        err = cudaMemcpy(result, d_new_array, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        //copy array from 1D to 2D array
        k = 0;
        for(int i = 0; i < rows; i++){
            for(int j = 0; j < cols; j++){
                pixelVal[i][j] = result[k];
                k++;
            }
        }
    } else {
        cerr << "negateImage: CUDA error: " << cudaGetErrorString(err) << endl;
    }

    // cudaFree on a null pointer is a no-op, so this is safe on every path.
    cudaFree(d_old_array);
    cudaFree(d_new_array);
    delete[] h_old_array;
    delete[] result;
}
</pre>