57
edits
Changes
→Kernel Version 2
{| class="wikitable mw-collapsible mw-collapsed"
! Flat profileCall Graph
|-
|
{| class="wikitable mw-collapsible mw-collapsed"
! Flat profileCall Graph
|-
|
=== Assignment 2 ===
The formula used here is: | (√1/n) , if u=0; 0≤v≤n-1 C(u,v) ===CUDA enabled functions==== The main function was changed to perform the copying of data from host to device, launch the kernel, copy back results from the device to host and release all memory | (√2/n) * cos[((2*v+1)π*u)/2n], on host and device. if 1≤u≤n-1; 0≤v≤n-1
Where, u is the row index, v is the column index and n is the total number of elements in a row/column in the computational matrix. This [https://www.youtube.com/watch?v=tW3Hc0Wrgl0 Link] can be used for better understanding of the above formula. Here is the [https://people.sc.fsu.edu/~jburkardt/cpp_src/cosine_transform/cosine_transform.html source code] used. =====Profiling=====The flat profile for the above serial code looks like: {| class="wikitable mw-collapsible mw-collapsed"! Flat Profile|-| 1 2 3 cudaMalloc4 granularity: each sample hit covers 2 byte(s) for 0.68% of 1.47 seconds 5 6 index % time self children called name 7 <spontaneous> 8 [1] 100.0 0.00 1.47 main [1] 9 0.00 1.47 1/1 cosine_transform_test01(void**int)&d_A, [3] 10 ----------------------------------------------- 11 1.47 0.00 1/1 cosine_transform_test01(sizeint) [3] 12 [2] 100.0 1.47 0.00 1 cosine_transform_data(int, double* sizeof) [2] 13 ----------------------------------------------- 14 0.00 1.47 1/1 main [1] 15 [3] 100.0 0.00 1.47 1 cosine_transform_test01(longint)[3] 16 1.47 0.00 1/1 cosine_transform_data(int, double*);[2] cudaMalloc 17 0.00 0.00 1/1 r8vec_uniform_01_new(int, int&) [14] 18 0.00 0.00 1/1 reportTime(void*char const*, std::chrono::duration<long, std::ratio<1l, 1000000000l> >)[13] 19 0.00 0.00 1/1 std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<std::chrono::_V2::s teady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 10 00000000l> > > const&d_B, (sizestd::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) * sizeof[21] 20 ----------------------------------------------- 21 0.00 0.00 1/3 std::chrono::duration<long, std::ratio<1l, 1000l> > std::chrono::__duration_cast_impl<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::ratio<1l, 1000000l>, long, true, false>: :__cast<long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&));[18] cudaMemcpy 22 0.00 0.00 2/3 std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<long, std::ratio<1l , 1000000000l>, long, std::ratio<1l, 1000000000l> >(d_Astd::chrono::duration<long, h_Astd::ratio<1l, size * sizeof(1000000000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&)[22] 23 [10] 0.0 0.00 0.00 3 std::chrono::duration<long, cudaMemcpyHostToDevicestd::ratio<1l, 1000000000l> >::count();const [10] 24 ----------------------------------------------- cudaMalloc 25 0.00 0.00 2/2 std::common_type<std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >::type std::chrono::operator-<std::chrono::_V2::s teady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> >, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >((void**std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 10 00000000l> > > const&, std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&)&d_ntpb[21] 26 [11] 0.0 0.00 0.00 2 std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, sizeof1000000000l> > >::time_since_epoch(dim3)const [11] 27 ----------------------------------------------- 28 0.00 0.00 1/1 __libc_csu_init [28] 29 [12] 0.0 0.00 0.00 1 _GLOBAL__sub_I__Z20r8vec_uniform_01_newiRi [12] 30 0.00 0.00 1/1 __static_initialization_and_destruction_0(int, int);[15] 31 ----------------------------------------------- cudaMalloc 32 0.00 0.00 1/1 cosine_transform_test01(int) [3] 33 [13] 0.0 0.00 0.00 1 reportTime(void*char const*, std::chrono::duration<long, std::ratio<1l, 1000000000l> >)[13] 34 0.00 0.00 1/1 std::enable_if<std::chrono::__is_duration<std::chrono::duration<long, std::ratio<1l, 1000l> > >::value, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::duratio n_cast<std::chrono::duration<long, std::ratio<1l, 1000l> >, long, std::ratio<1l, 1000000000l> >(std::chrono::duration<long, std::ratio<1l, 1000000000l> > const&d_nbpg) [17] 35 0.00 0.00 1/1 std::chrono::duration<long, std::ratio<1l, sizeof1000l> >::count(dim3)const [16] 36 ----------------------------------------------- 37 0.00 0.00 1/1 cosine_transform_test01(int);[3] cudaMemcpy 38 [14] 0.0 0.00 0.00 1 r8vec_uniform_01_new(d_ntpbint, int&ntpb, sizeof) [14] 39 ----------------------------------------------- 40 0.00 0.00 1/1 _GLOBAL__sub_I__Z20r8vec_uniform_01_newiRi [12] 41 [15] 0.0 0.00 0.00 1 __static_initialization_and_destruction_0(dim3)int, cudaMemcpyHostToDeviceint);[15] 42 ----------------------------------------------- cudaMemcpy 43 0.00 0.00 1/1 reportTime(d_nbpgchar const*, &nbpgstd::chrono::duration<long, sizeof(dim3)std::ratio<1l, cudaMemcpyHostToDevice1000000000l> >);[13] 44 [16] 0.0 0.00 0.00 1 std::chrono::duration<long *d_actual = d_A;, std::ratio<1l, 1000l> >::count() const [16] long *d_swap = d_B; 45 ----------------------------------------------- long totalThreads = ntpb 46 0.x * nbpg00 0.x; float start_time = clock00 1/1 reportTime(char const*, std::chrono::duration<long, std::ratio<1l, 1000000000l> >);[13] for (int width = 2; width 47 [17] 0.0 0.00 0.00 1 std::enable_if<std::chrono::__is_duration<std::chrono::duration<long, std::ratio<1l, 1000l> > >::value, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::duration_ca st<std::chrono::duration<long, std::ratio<1l, 1000l> >, long, std::ratio< 1l, 1000000000l> >(size std::chrono::duration<long, std::ratio< 1l, 1000000000l> > const&) [17] 48 0.00 0.00 1); width /1 std::chrono::duration<long, std::ratio<1l, 1000l> > std::chrono::__duration_cast_impl<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::ratio<1l, 1000000l>, long, true, false>: :__cast<long, std::ratio<1l, 1000000000l> >(std::chrono::duration<= 1long, std::ratio<1l, 1000000000l> > const&) {[18] 49 ----------------------------------------------- 50 0.00 0.00 1/1 std::enable_if<std::chrono::__is_duration<std::chrono::duration<long slices = size / , std::ratio<1l, 1000l> > >::value, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::duratio n_cast<std::chrono::duration<long, std::ratio<1l, 1000l> >, long, std::ratio<1l, 1000000000l> >(totalThreads * widthstd::chrono::duration<long, std::ratio<1l, 1000000000l> > const&) + [17] 51 [18] 0.0 0.00 0.00 1; merge_sort std::chrono::duration<long, std::ratio<1l, 1000l> > std::chrono::__duration_cast_impl<std::chrono::duration< long, std::ratio<nbpg1l, ntpb 1000l>> , std::ratio<1l, 1000000l> (d_actual, d_swaplong, sizetrue, widthfalse>::__c ast<long, slicesstd::ratio<1l, d_ntpb1000000000l> >(std::chrono::duration<long, d_nbpgstd::ratio<1l, 1000000000l> > const&);[18] 52 0.00 0.00 1/3 std::chrono::duration<long, std::ratio<1l, 1000000000l> >::count() const [10] |} cudaDeviceSynchronizeAs is evident, the algorithm is O(n2)currently. Using thread indices on the GPU to replace the for loops could potentially improve performance.To increase the efficiency of the program we transformed the '''cosine_transform_data''' function into a kernel named '''cosTransformKernel''' which offloads the compute intense calculation of the program to the GPU. =====Kernel Version 1====={| class="wikitable mw-collapsible mw-collapsed"! Modified Code|-| # include <iostream> # include <iomanip> # include <ctime> # include <chrono> # include <cstdlib> # include <cmath> #include <cuda_runtime.h> using namespace std; using namespace std::chrono; d_actual const double pi = d_actual 3.141592653589793; const int ntpb =1024; void cosine_transform_test01 ( int size ); double *r8vec_uniform_01_new ( int n, int &seed ){ int i; const int i4_huge = d_A ? d_B : d_A2147483647; int k; double *r; d_swap if ( seed = d_swap =0 ){ cerr << "\n"; cerr << "R8VEC_UNIFORM_01_NEW - Fatal error!\n"; cerr << " Input value of SEED = d_A ? d_B : d_A0.\n"; exit ( 1 );
}
}
}
void cosine_transform_test01 ( int size){ int n =size; int seed; double *r; double *hs; double *s =new double[n]; double *d_a; double *d_b; //allocate memory on the device for the randomly generated array and for the array in which transform values will be stored cudaMalloc((void**)&d_a,sizeof(double) * n); cudaMalloc((void**)&d_b,sizeof(double) * n); seed =123456789; r = r8vec_uniform_01_new ( n, seed ); //copy randomly generated values from host to device cudaMemcpy(d_a,r,sizeof(double)*n,cudaMemcpyHostToDevice); int nblks = (n + ntpb - 1) / ntpb; steady_clock::time_point ts, te; ts = steady_clock::now(); cosTransformKernel<<<nblks,ntpb>>>(d_a,d_b,size); cudaDeviceSynchronize(); te = steady_clock::now(); reportTime("Cosine Transform on device",te-ts); cudaMemcpy(s,d_b,sizeof(double)*n,cudaMemcpyDeviceToHost); ts = steady_clock::now(); hs = cosine_transform_data ( n, r ); te = steady_clock::now(); reportTime("Cosine Transform on host",te-ts); cudaFree(d_a); cudaFree(d_b); delete [] r; delete [] s; delete [] hs;} |} The graph for the execution time difference between the device and the host looks like: [[File:kernel1.png]] Even though the kernel includes a for-loop the execution time has decreased drastically. Thats because each thread is now responsible for one calculating one element of the final Cos transformed matrix(unit vector). === Assignment 3 === For optimizing the code better, we thought of removing the iterative loop from the kernel by using threadIdx.y to control calculation of each element's cosine for that position in the supposed matrix. The problem in this was that each thread was in a racing condition to write to the same memory location, to sum up the cosine transformations for all elements of that row. We solved this by using the atomic function. Its prototype is as follows.double atomicAdd(double* address, double value) =====Kernel Version 2=Profiling results====
{| class="wikitable mw-collapsible mw-collapsed"
! Flat profileKernel 2
|-
| # include <cmath> # include <cstdlib> # include <iostream> # include <iomanip> # include <ctime> # include <chrono> # include <cstdlib> # include <cmath> #include <limits> #include <cuda_runtime.h> #include <cuda.h> using namespace std; using namespace std::chrono; const double pi = 3.141592653589793; const unsigned ntpb = 32; void cosine_transform_test01 ( int size ); double *r8vec_uniform_01_new ( int n, int &seed ){ int i; const int i4_huge = 2147483647; int k; double *r; if ( seed == 0 ){ cerr << "\n"; cerr << "R8VEC_UNIFORM_01_NEW - Fatal error!\n"; cerr << " Input value of SEED = 0.\n"; exit ( 1 ); } r = new double[n]; for ( i = 0; i < n; i++ ){ k = seed / 127773; seed = 16807 * ( seed - k * 127773 ) - k * 2836; if ( seed < 0 ){ seed = seed + i4_huge; } r[i] = ( double ) ( seed ) * 4.656612875E-10; } return r; } double *cosine_transform_data ( int n, double d[] ){ double angle; double *c; int i; int j; c = new double[n]; for ( i = 0; i < n; i++ ){ c[i] = 0.0; for ( j = 0; j < n; j++ ){ angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n ); c[i] = c[i] + cos ( angle ) * d[j]; } c[i] = c[i] * sqrt ( 2.0 / ( double ) ( n ) ); } return c; } void reportTime(const char* msg, steady_clock::duration span) { auto ms = duration_cast<milliseconds>(span); std::cout << msg << " - took - " << ms.count() << " millisecs" << std::endl; } __global__ void cosTransformKernel(double *a, double *b, const int n){ double angle; const double pi = 3.141592653589793; int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; if(i<n && j<n){ angle = pi * ( double ) ( i * ( 2 * j + 1 ) ) / ( double ) ( 2 * n ); double value = cos ( angle ) * a[j]; b[i] = atomicAdd(&b[i], value); } //square root of the whole cos transformed row term if(j==n-1 && i<n){ b[i] *= sqrt ( 2.0 / ( double ) ( n ) ); } } int main (int argc, char* argv[] ){ if (argc != 2) { std::cerr << argv[0] << ": invalid number of arguments\n"; std::cerr << "Usage: " << argv[0] << " size_of_vector\n"; return 1; } int n = std::atoi(argv[1]); cosine_transform_test01 (n); return 0; } void cosine_transform_test01 ( int size){ int n = size; int seed; double *r; double *hs; //host side pointer to store the array returned from host side cosine_transform_data, for comparison purposes double *s = new double[n]; //double *t; double *d_a; double *d_b; //allocate memory on the device for the randomly generated array and for the array in which transform values will be stored cudaMalloc((void**)&d_a,sizeof(double) * n); cudaMalloc((void**)&d_b,sizeof(double) * n); seed = 123456789; r = r8vec_uniform_01_new ( n, seed ); //copy randomly generated values from host to device for(int i=0; i<n; i++) s[i]=0.0; cudaMemcpy(d_a,r,sizeof(double)*n,cudaMemcpyHostToDevice); cudaMemcpy(d_b,s,sizeof(double)*n,cudaMemcpyHostToDevice); int nblks = (n + ntpb - 1) / ntpb; dim3 grid(nblks,nblks,1); dim3 block(ntpb,ntpb,1); steady_clock::time_point ts, te; ts = steady_clock::now(); cosTransformKernel<<<grid,block>>>(d_a,d_b,size); cudaDeviceSynchronize(); te = steady_clock::now(); reportTime("Cosine Transform on device",te-ts); cudaMemcpy(s,d_b,sizeof(double)*n,cudaMemcpyDeviceToHost); ts = steady_clock::now(); hs = cosine_transform_data ( n, r ); te = steady_clock::now(); reportTime("Cosine Transform on host",te-ts);
cudaFree(d_a);
cudaFree(d_b);
delete [] r;
delete [] s;
delete [] hs;
//delete [] t;
return;
}
|}