1
edit
Changes
Hu3Team
,no edit summary
Building on assignment 1, we added the source code that makes it possible to run the program on a CUDA device, as follows.
<pre>
// Double-precision copyMat kernel taking a read-only matrix w and a writable
// matrix u. The body is empty as shown here, so the kernel performs no work.
// NOTE(review): presumably intended to copy w into u element by element, like
// the float copyMat further down this page - confirm against the full source.
__global__ void copyMat(const double *w, double *u){
}
</pre>
Moreover, the error tolerance (epsilon) is now set in the code instead of being read as input. After working through many difficulties while coding, we finally obtained good results compared with the code from assignment 1. The runtime decreased, which showed us how much CUDA can do to speed up the processing.
=== Assignment 3 ===
====CUDA Coding====
Based on assignment 2, we made optimizations to speed up the execution, as follows.
<pre>
/*
 * Copies one element of w into the same position of u.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per element. i is taken
 * from the x dimension and j from the y dimension; threads with i >= M or
 * j >= N do nothing, so the grid may overshoot the matrix.
 *
 * M and N are the file-level grid dimensions (defined outside this block).
 *
 * w  source matrix, M * N floats (read only)
 * u  destination matrix, M * N floats
 *
 * The element offset is j * M + i, matching the guard (i < M, j < N) and the
 * layout calcHeat uses. The previous i * M + j offset could exceed M * N - 1
 * whenever M > N. Using i (derived from threadIdx.x) as the fastest-varying
 * index also makes neighbouring threads touch neighbouring addresses.
 */
__global__ void copyMat(const float *w, float *u){
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < M && j < N) {
        const int idx = j * M + i;
        u[idx] = w[idx];
    }
    // No barrier needed: threads share no data in this kernel.
}
/*
 * Performs one averaging step on w in place, one ntpb x ntpb tile per block,
 * and records the absolute change of every cell in d_array.
 *
 * Launch layout: 2D grid of 2D blocks; i comes from the x dimension and j
 * from the y dimension. Threads with threadIdx.x or threadIdx.y >= ntpb, or
 * with i >= M or j >= N, neither read nor write global memory.
 * Shared memory: one static ntpb x ntpb float tile.
 * M, N and ntpb are file-level constants defined outside this block.
 *
 * w        matrix of M * N floats, offset j * M + i; updated in place
 * u        unused by this kernel (kept for interface compatibility)
 * d        unused by this kernel (kept for interface compatibility)
 * m, n     unused by this kernel; indexing relies on M and N
 * d_array  M * N floats; receives |new - old| per cell, 0 for cells that
 *          are not updated
 *
 * Only cells strictly inside their tile AND strictly inside the grid are
 * replaced by the mean of their four neighbours. All other cells are written
 * back unchanged with a difference of 0.
 * NOTE(review): because no halo is loaded, cells on a tile edge are never
 * updated even when they are interior to the grid - confirm this is intended.
 */
__global__ void calcHeat(float *w, float *u, float *d, int m, int n, float* d_array){
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int i = blockIdx.x * blockDim.x + tx;
    const int j = blockIdx.y * blockDim.y + ty;

    // Snapshot of this block's tile of w, shared so neighbours can be read.
    __shared__ float s_u[ntpb][ntpb];

    const bool inTile = (tx < ntpb) && (ty < ntpb);
    // Guard every global access: the grid may overshoot the matrix.
    const bool inGrid = inTile && (i < M) && (j < N);
    const int idx = j * M + i;

    // Out-of-grid threads store 0 so the tile never holds uninitialised data.
    const float cur = inGrid ? w[idx] : 0.0f;
    if (inTile) {
        s_u[ty][tx] = cur;
    }
    // All threads reach this barrier; the tile is complete afterwards.
    __syncthreads();

    // Defaults cover every cell that is not updated: value kept, change 0.
    float next = cur;
    float dif = 0.0f;

    const bool tileInterior = (tx > 0) && (ty > 0) && (tx < ntpb - 1) && (ty < ntpb - 1);
    // i < M - 1 and j < N - 1 ensure the +1 neighbours are real grid cells.
    if (tileInterior && (i < M - 1) && (j < N - 1)) {
        next = (s_u[ty - 1][tx] + s_u[ty + 1][tx] + s_u[ty][tx - 1] + s_u[ty][tx + 1]) / 4.0f;
        dif = fabsf(next - cur);
    }

    if (inGrid) {
        w[idx] = next;
        d_array[idx] = dif;
    }
}
/*
 * Raises *addr to value if value is larger, atomically.
 * Implemented as a compare-and-swap loop on the float's bit pattern, so it is
 * valid for any sign. A NaN value never compares greater and is ignored.
 */
__device__ static inline void bigDiffAtomicMax(float *addr, float value) {
    int *bits = reinterpret_cast<int *>(addr);
    int seen = *bits;
    while (value > __int_as_float(seen)) {
        const int prev = atomicCAS(bits, seen, __float_as_int(value));
        if (prev == seen) {
            break;          // our value was installed
        }
        seen = prev;        // another thread changed *addr; re-evaluate
    }
}

/*
 * Folds the largest element of d_array into *d: after the kernel completes,
 * *d == max(initial *d, d_array[0 .. m*n-1]).
 *
 * Launch layout: 1D (only the x dimension is used). Any grid size is valid;
 * each thread walks the array with a stride of gridDim.x * blockDim.x.
 * No shared memory is used.
 *
 * d_array  m * n floats to scan (read only)
 * d        single float holding the running maximum; the caller must
 *          initialise it before launch, since it is only ever raised
 * m, n     dimensions; m * n is the element count
 *
 * The earlier version updated *d from every thread without atomics and
 * called __syncthreads() inside a loop whose trip count depends on the
 * thread index. Here each thread reduces privately and publishes once
 * through an atomic maximum, so no block barrier is needed.
 */
__global__ void bigDiff(float* d_array, float* d, int m, int n){
    const int total = m * n;
    const int stride = gridDim.x * blockDim.x;

    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= total) {
        return;             // nothing assigned to this thread
    }

    float local = d_array[k];
    for (k += stride; k < total; k += stride) {
        local = fmaxf(local, d_array[k]);
    }

    bigDiffAtomicMax(d, local);
}
</pre>
====Comparing the results====
As a result of the changes to the source code shown above, the processing time was reduced.
[[File:runtime-CUDA.JPG]]