=== Assignment 2 ===
Original CPU Implementation:
void Image::rotateImage(int theta, Image& oldImage)
/* rotates the image by theta degrees (based on the user's input) around the center of the image */
{
    int r0, c0;
    int r1, c1;
    int rows, cols;
    rows = oldImage.N;
    cols = oldImage.M;
    Image tempImage(rows, cols, oldImage.Q);
    float rads = (theta * 3.14159265) / 180.0;
    r0 = rows / 2;
    c0 = cols / 2;
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            r1 = (int)(r0 + ((r - r0) * cos(rads)) - ((c - c0) * sin(rads)));
            c1 = (int)(c0 + ((r - r0) * sin(rads)) + ((c - c0) * cos(rads)));
            if (inBounds(r1, c1))
            {
                tempImage.pixelVal[r1][c1] = oldImage.pixelVal[r][c];
            }
        }
    }
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            if (tempImage.pixelVal[i][j] == 0)
                tempImage.pixelVal[i][j] = tempImage.pixelVal[i][j + 1];
        }
    }
    oldImage = tempImage;
}
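Note that the rotation loop above is a forward (scatter) mapping: each source pixel is projected onto a destination, so some destination pixels never receive a value, which is what the final zero-patching loop compensates for. Below is a minimal standalone sketch (not part of the assignment code; the image size and angle are arbitrary) that counts those unwritten pixels:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int rows = 64, cols = 64;                   // hypothetical image size
    const float rads = 45.0f * 3.14159265f / 180.0f;  // hypothetical rotation angle
    const int r0 = rows / 2, c0 = cols / 2;
    std::vector<bool> written(rows * cols, false);
    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++) {
            // same forward mapping as the rotateImage loop above
            int r1 = (int)(r0 + (r - r0) * cos(rads) - (c - c0) * sin(rads));
            int c1 = (int)(c0 + (r - r0) * sin(rads) + (c - c0) * cos(rads));
            if (r1 >= 0 && r1 < rows && c1 >= 0 && c1 < cols)
                written[r1 * cols + c1] = true;       // this destination received a pixel
        }
    int holes = 0;
    for (int i = 0; i < rows * cols; i++)
        if (!written[i]) holes++;                     // never written: a "hole" in the rotated image
    printf("unwritten destination pixels: %d of %d\n", holes, rows * cols);
    return 0;
}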
Parallelized Code:
Kernels:
__device__ bool inBounds(int row, int col, int maxRow, int maxCol) {
    if (row >= maxRow || row < 0 || col >= maxCol || col < 0)
        return false;
    // else
    return true;
}

__global__ void rotateKernel(int* oldImage, int* newImage, int rows, int cols, float rads) {
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    int c = blockIdx.y * blockDim.y + threadIdx.y;
    int r0 = rows / 2;
    int c0 = cols / 2;
    float sinRads = sinf(rads);
    float cosRads = cosf(rads);
    /*__shared__ int s[ntpb * ntpb];
    s[r * cols + c] = oldImage[r * cols + c];*/
    if (r < rows && c < cols)
    {
        int r1 = (int)(r0 + ((r - r0) * cosRads) - ((c - c0) * sinRads));
        int c1 = (int)(c0 + ((r - r0) * sinRads) + ((c - c0) * cosRads));
        if (inBounds(r1, c1, rows, cols))
        {
            newImage[r1 * cols + c1] = oldImage[r * cols + c];
        }
    }
}
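A note on the design: like the CPU version, this kernel scatters, so two source pixels can land on the same destination and some destinations are never written (the holes patched on the host afterwards). One possible alternative, not part of the submitted code, is a gather kernel that inverts the rotation so each destination pixel reads exactly one source pixel. A sketch under that assumption:

__global__ void rotateGatherKernel(const int* oldImage, int* newImage, int rows, int cols, float rads) {
    int r = blockIdx.x * blockDim.x + threadIdx.x;   // destination row
    int c = blockIdx.y * blockDim.y + threadIdx.y;   // destination column
    if (r >= rows || c >= cols) return;
    int r0 = rows / 2;
    int c0 = cols / 2;
    float sinRads = sinf(rads);
    float cosRads = cosf(rads);
    // invert the rotation: find the source pixel that lands on (r, c)
    int rs = (int)(r0 + (r - r0) * cosRads + (c - c0) * sinRads);
    int cs = (int)(c0 - (r - r0) * sinRads + (c - c0) * cosRads);
    // each destination pixel is written exactly once, so there are no holes and no write conflicts
    newImage[r * cols + c] =
        (rs >= 0 && rs < rows && cs >= 0 && cs < cols) ? oldImage[rs * cols + cs] : 0;
}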
Modified Function:
void Image::rotateImage(int theta, Image& oldImage)
/* rotates the image by theta degrees (based on the user's input) around the center of the image */
{
    int rows, cols;
    rows = oldImage.N;
    cols = oldImage.M;
    Image tempImage(rows, cols, oldImage.Q);
    float rads = (theta * 3.14159265) / 180.0;
    // workspace start
    // - calculate number of blocks for n rows (assumes a square image)
    int nb = (rows + ntpb - 1) / ntpb;
    // allocate memory for matrices d_a, d_b on the device
    int* d_a;
    check("device a", cudaMalloc((void**)&d_a, rows * cols * sizeof(int)));
    int* d_b;
    check("device b", cudaMalloc((void**)&d_b, rows * cols * sizeof(int)));
    // zero d_b so that destination pixels the kernel never writes remain 0
    // and can be patched by the hole-filling loop below
    check("memset d_b", cudaMemset(d_b, 0, rows * cols * sizeof(int)));
    // copy the source image to d_a (host to device)
    check("copy to d_a", cudaMemcpy(d_a, oldImage.pixelVal, rows * cols * sizeof(int), cudaMemcpyHostToDevice));
    // launch execution configuration
    // - define the 2D grid of blocks
    dim3 dGrid(nb, nb);
    // - define the 2D block of threads
    dim3 dBlock(ntpb, ntpb);
    // - launch the execution configuration
    rotateKernel<<<dGrid, dBlock>>>(d_a, d_b, rows, cols, rads);
    // - check for launch errors
    check("launch error: ", cudaPeekAtLastError());
    // synchronize the device and the host
    check("Synchronize ", cudaDeviceSynchronize());
    // copy d_b to tempImage (device to host)
    check("device copy to host", cudaMemcpy(tempImage.pixelVal, d_b, rows * cols * sizeof(int), cudaMemcpyDeviceToHost));
    // deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    // reset the device
    cudaDeviceReset();
    // workspace end
    // patch "holes" left by the forward mapping with the pixel to their right
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            if (tempImage.pixelVal[i * cols + j] == 0 && j + 1 < cols)
                tempImage.pixelVal[i * cols + j] = tempImage.pixelVal[i * cols + j + 1];
        }
    }
    oldImage = tempImage;
}
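The workspace above relies on a check() error-reporting helper and an ntpb (threads-per-block) constant that are defined elsewhere in the workshop code and are not shown in this section. A minimal sketch of what they might look like; the value 32 and the exact signature are assumptions, not the course-provided definitions:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

const int ntpb = 32;  // assumed threads per block in each dimension (32 x 32 = 1024 threads per block)

// assumed helper: report the message and the CUDA error string, then abort on failure
void check(const char* msg, cudaError_t err) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}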
{|
! Function
! CPU-Only (s)
! GPU-CPU (s)
! Speedup (%)
|-
| function1
| s
| s
| %
|-
| function2
| s
| s
| %
|}
=== Assignment 3 ===