Changes

Algo holics

3,839 bytes added, 01:56, 8 April 2019

→‎Assignment 3

To optimize the code further, we removed the iterative loop from the kernel and used threadIdx.y so that each thread computes the cosine term of a single element for its position in the conceptual n-by-n matrix. The problem with this approach is that all threads working on the same row are in a race condition: they write to the same memory location in order to sum the cosine terms of that row. We solved this by using the atomic add function, which performs the read-modify-write as one indivisible operation and returns the value that was stored at the address before the addition. Its prototype is as follows.

double atomicAdd(double* address, double value)

=====Kernel 2=====

{| class="wikitable mw-collapsible mw-collapsed"

! Kernel 2

|-

|

# include <cmath>

# include <cstdlib>

# include <iostream>

# include <iomanip>

# include <ctime>

# include <chrono>

# include <cstdlib>

# include <cmath>

#include <limits>

#include <cuda_runtime.h>

#include <cuda.h>

// Bring the standard library and std::chrono names into scope; the rest of
// the file refers to cerr, steady_clock, milliseconds, etc. unqualified.
using namespace std;

using namespace std::chrono;

// Value of pi used by the host-side reference transform.
const double pi = 3.141592653589793;

// Edge length of the square thread block: the kernel is launched with
// ntpb x ntpb threads per block (32 x 32 = 1024 threads).
const unsigned ntpb = 32;

// Forward declaration; the definition follows main().
void cosine_transform_test01 ( int size );

/*
 * Generates n pseudorandom doubles with a multiplicative congruential
 * generator (multiplier 16807, modulus 2147483647).
 *
 * n    - number of values to generate.
 * seed - generator state; must be non-zero on entry and is advanced once
 *        per generated value, so the caller can continue the sequence.
 *
 * Returns a new[]-allocated array of n values; the caller must delete[] it.
 * Prints a diagnostic and terminates the process with status 1 if seed is 0.
 */
double *r8vec_uniform_01_new ( int n, int &seed ){
    const int modulus = 2147483647;

    // A zero seed would make the generator emit zeros forever.
    if ( seed == 0 ){
        std::cerr << "\n"
                  << "R8VEC_UNIFORM_01_NEW - Fatal error!\n"
                  << " Input value of SEED = 0.\n";
        std::exit ( 1 );
    }

    double *values = new double[n];

    for ( int idx = 0; idx < n; ++idx ){
        // Split the multiplication so the intermediate product stays
        // within the range of a 32-bit int.
        const int quotient = seed / 127773;
        const int remainder = seed - quotient * 127773;
        seed = 16807 * remainder - quotient * 2836;

        if ( seed < 0 ){
            seed += modulus;
        }

        // Scale the integer state by 4.656612875E-10 to get the output value.
        values[idx] = static_cast<double> ( seed ) * 4.656612875E-10;
    }

    return values;
}

/*
 * Host-side reference cosine transform.
 *
 * Computes, for every i in [0, n):
 *     c[i] = sqrt(2/n) * sum_{j=0}^{n-1} cos(pi * i * (2j + 1) / (2n)) * d[j]
 *
 * n - number of elements in d (and in the returned array).
 * d - input values; read only.
 *
 * Returns a new[]-allocated array of n values; the caller must delete[] it.
 */
double *cosine_transform_data ( int n, double d[] ){
    double *c = new double[n];

    // Loop-invariant factors, computed once.
    const double scale = std::sqrt ( 2.0 / ( double ) ( n ) );
    const double denom = 2.0 * ( double ) ( n );

    for ( int i = 0; i < n; i++ ){
        double sum = 0.0;

        for ( int j = 0; j < n; j++ ){
            // Form i * (2j + 1) in double: as an int product it exceeds
            // INT_MAX once n is larger than roughly 32768.
            const double k = ( double ) ( i ) * ( 2.0 * ( double ) ( j ) + 1.0 );
            const double angle = pi * k / denom;
            sum = sum + std::cos ( angle ) * d[j];
        }

        c[i] = sum * scale;
    }

    return c;
}

/*
 * Prints an elapsed time to standard output.
 *
 * msg  - label printed in front of the measurement.
 * span - elapsed interval; truncated to whole milliseconds for display.
 */
void reportTime(const char* msg, steady_clock::duration span) {
    const auto elapsedMs = duration_cast<milliseconds>(span).count();
    std::cout << msg << " - took - " << elapsedMs << " millisecs" << std::endl;
}

/*
 * Cosine transform kernel: one thread per (output i, input j) pair.
 *
 * Each thread computes a single term
 *     sqrt(2/n) * cos(pi * i * (2j + 1) / (2n)) * a[j]
 * and atomically accumulates it into b[i].
 *
 * a - device input vector of n elements (read only).
 * b - device output vector of n elements; MUST be zero-filled before the
 *     launch because the kernel only adds to it.
 * n - number of elements in a and b.
 *
 * Launch layout: 2D grid of 2D blocks; the x dimension indexes the input
 * element j and the y dimension indexes the output element i. The grid must
 * cover at least n threads in both dimensions.
 *
 * Requires compute capability 6.0+ for atomicAdd on double.
 */
__global__ void cosTransformKernel(double *a, double *b, const int n){
    const double pi = 3.141592653589793;

    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the grid tail have no work.
    if (i < n && j < n) {
        // Form i * (2j + 1) in double: as an int product it exceeds INT_MAX
        // once n is larger than roughly 32768.
        const double k = ( double ) ( i ) * ( 2.0 * ( double ) ( j ) + 1.0 );
        const double angle = pi * k / ( 2.0 * ( double ) ( n ) );

        // The normalisation factor is folded into every term. Scaling b[i]
        // afterwards from one thread would race with additions still pending
        // in other threads and blocks, since there is no grid-wide barrier.
        const double scale = sqrt ( 2.0 / ( double ) ( n ) );

        // atomicAdd returns the value stored BEFORE the addition; it must not
        // be written back to b[i], or concurrent contributions are lost.
        atomicAdd(&b[i], scale * cos ( angle ) * a[j]);
    }
}

/*
 * Program entry point.
 *
 * Usage: <program> size_of_vector
 *
 * Parses the vector size from the single command-line argument and runs the
 * cosine transform test with it. Returns 0 on success, or 1 when the argument
 * count is wrong or the size is not a positive integer that fits in an int.
 */
int main (int argc, char* argv[] ){
    if (argc != 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << " size_of_vector\n";
        return 1;
    }

    // strtoll reports where parsing stopped, so empty, non-numeric and
    // trailing-garbage input can be rejected. On overflow it saturates to
    // LLONG_MAX, which the range check below also rejects.
    char *end = nullptr;
    const long long requested = std::strtoll(argv[1], &end, 10);
    const bool fullyParsed = (end != argv[1]) && (*end == '\0');

    if (!fullyParsed || requested < 1 ||
        requested > std::numeric_limits<int>::max()) {
        std::cerr << argv[0] << ": size_of_vector must be a positive integer\n";
        std::cerr << "Usage: " << argv[0] << " size_of_vector\n";
        return 1;
    }

    cosine_transform_test01 (static_cast<int>(requested));

    return 0;
}

/*
 * Runs the cosine transform of a pseudorandom vector on the device and on the
 * host, reports the time taken by each, and prints the largest absolute
 * difference between the two results.
 *
 * size - number of elements in the vector; must be positive, otherwise a
 *        message is printed and the function returns without doing any work.
 *
 * Any failing CUDA runtime call prints a diagnostic and terminates the
 * process with status 1.
 */
void cosine_transform_test01 ( int size){
    const int n = size;

    // A non-positive size would produce an empty grid (invalid launch).
    if (n <= 0) {
        std::cerr << "cosine_transform_test01: size must be positive, got "
                  << n << "\n";
        return;
    }

    // Stops at the first CUDA failure so later calls do not fail mysteriously.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error (" << what << "): "
                      << cudaGetErrorString(status) << "\n";
            std::exit(1);
        }
    };

    const size_t bytes = sizeof(double) * static_cast<size_t>(n);

    // Host data: r is the random input, s receives the device result.
    int seed = 123456789;
    double *r = r8vec_uniform_01_new ( n, seed );
    double *s = new double[n];

    // Device buffers for the input vector and the transformed output.
    double *d_a = nullptr;
    double *d_b = nullptr;
    check(cudaMalloc((void**)&d_a, bytes), "allocate device input");
    check(cudaMalloc((void**)&d_b, bytes), "allocate device output");

    check(cudaMemcpy(d_a, r, bytes, cudaMemcpyHostToDevice),
          "copy input to device");
    // The kernel accumulates into d_b, so it must start at zero; an all-zero
    // byte pattern is 0.0 for double.
    check(cudaMemset(d_b, 0, bytes), "zero device output");

    // One thread per (output, input) pair: ceil(n / ntpb) blocks per axis.
    const unsigned nblks = (static_cast<unsigned>(n) + ntpb - 1) / ntpb;
    dim3 grid(nblks,nblks,1);
    dim3 block(ntpb,ntpb,1);

    steady_clock::time_point ts, te;

    ts = steady_clock::now();
    cosTransformKernel<<<grid,block>>>(d_a,d_b,size);
    // Launch-configuration errors surface here; execution errors at the sync.
    check(cudaGetLastError(), "kernel launch");
    check(cudaDeviceSynchronize(), "kernel execution");
    te = steady_clock::now();
    reportTime("Cosine Transform on device",te-ts);

    check(cudaMemcpy(s, d_b, bytes, cudaMemcpyDeviceToHost),
          "copy result to host");

    // Host-side reference transform, for timing and comparison.
    ts = steady_clock::now();
    double *hs = cosine_transform_data ( n, r );
    te = steady_clock::now();
    reportTime("Cosine Transform on host",te-ts);

    // Device and host sum in different orders, so compare by magnitude of
    // the difference rather than expecting bit-exact equality.
    double maxDiff = 0.0;
    for (int i = 0; i < n; i++) {
        const double diff = std::fabs(s[i] - hs[i]);
        if (diff > maxDiff) {
            maxDiff = diff;
        }
    }
    std::cout << "Max abs difference between device and host results: "
              << maxDiff << std::endl;

    check(cudaFree(d_a), "free device input");
    check(cudaFree(d_b), "free device output");

    delete [] r;
    delete [] s;
    delete [] hs;
}

|}

Here is a comparison between the naive and optimized kernel

Ssdhillon20

57

edits

CDOT Wiki β

Changes

Algo holics

CDOT Wiki ^β