== Optimizing Problems ==
At the beginning of the program, a handle was created for grabbing the computer's CUDA device; through this handle we can extract the CUDA device properties to examine our parameters:
'''
// list the properties
std::cout << "Name: " << prop.name << std::endl;
std::cout << "Compute Capability: " << prop.major << '.' << prop.minor << std::endl;
std::cout << "Total Global Memory: " << prop.totalGlobalMem << std::endl;
std::cout << "Max Threads per block: " << prop.maxThreadsPerBlock << std::endl;
std::cout << "Clock Rate in kHz: " << prop.clockRate << "\n\n";
'''
We grab the device's properties so that we do not exceed its resources.
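For context, here is a minimal sketch of how such a handle and the prop structure can be obtained (the use of device 0 and the omission of error checking are assumptions for illustration, not taken from the original program):
'''
#include <cuda_runtime.h>
#include <iostream>

int main()
{
    int device = 0;                          // assumed: first CUDA device
    cudaGetDevice(&device);                  // handle to the current CUDA device
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);  // fill prop with the device's limits
    std::cout << "Max Threads per block: " << prop.maxThreadsPerBlock << std::endl;
    return 0;
}
'''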
The original algorithm was split into 2 kernels. The first kernel, which causes no problems, is as follows:
'''
__global__ void Initalize(double* u, double* un, int nx, int nt, double dx)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    for (int k = 0; k <= nx - 1; k++)
    {
        if (k * dx >= 0.5 && k * dx <= 1)
        {
            u[k * nt] = 2;
            __syncthreads();
        }
        else
        {
            u[k * nt] = 1;
            __syncthreads();
        }
    }
}
'''
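For reference, a minimal sketch of how a kernel like this might be launched (the 32x32 block shape, the d_u/d_un device pointers, and the synchronize call are assumptions for illustration, not the original program's launch code):
'''
int ntpb = 32;                                       // assumed threads per block per dimension
dim3 dGrid((nx + ntpb - 1) / ntpb, (nx + ntpb - 1) / ntpb);
dim3 dBlock(ntpb, ntpb);
// d_u, d_un: device arrays of nx * nx doubles, allocated with cudaMalloc beforehand
Initalize<<<dGrid, dBlock>>>(d_u, d_un, nx, nt, dx);
cudaDeviceSynchronize();                             // wait for the kernel to finish
'''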
The second kernel works perfectly fine for arguments of less than 1024 1024 (the user inputs 2 values); anything higher, for example an argument of 2000 2000, will crash the NVIDIA driver and the results are reset to their pre-kernel-launch values. The kernel code is below:
'''
__global__ void Calculate(double* u, double* un, int nx, int c, double dx, double dt)
{
    for (int it = 1; it <= nx - 1; it++)
    {
        for (int k = 0; k <= nx - 1; k++)
        {
            un[k * nx + it - 1] = u[k * nx + it - 1];
        }
        for (int m = 1; m <= nx - 1; m++)
        {
            u[0 * nx + it] = un[1 * nx + it - 1];
            u[m * nx + it] = un[m * nx + it - 1] - c * dt / dx * (un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);
        }
    }
}
'''
== Solution to first Kernel problem ==
The problem resulted from this calculation: '''u[m * nx + it] = un[m * nx + it - 1] - c*dt / dx*(un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);''' Perhaps recomputing with the c, dt, and dx values incurred too many trips to global memory, which caused the operation to hang, and CUDA automatically crashed the driver. To solve this problem, a scalar (local) variable was created to store this value in register memory for each thread to access: '''double total = c*dt / dx;''' Now the program executes with an argument of 2000 2000 and yields results similar to the original program:
'''
__global__ void Calculate(double* u, double* un, int nx, int c, double dx, double dt)
{
    double total = c * dt / dx;
    for (int it = 1; it <= nx - 1; it++)
    {
        for (int k = 0; k <= nx - 1; k++)
        {
            un[k * nx + it - 1] = u[k * nx + it - 1];
        }
        for (int m = 1; m <= nx - 1; m++)
        {
            u[0 * nx + it] = un[1 * nx + it - 1];
            u[m * nx + it] = un[m * nx + it - 1] - total * (un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);
        }
    }
}
'''
== Problem with second Kernel ==
Unfortunately a new problem has arisen: when the arguments are raised above 2000 2000, the NVIDIA driver once again crashes, and I am currently stuck with no solution.
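One way to investigate a crash like this (a sketch, not part of the original program; dGrid, dBlock, and the device pointers are assumed from the launch example above) is to time the kernel with CUDA events and inspect the error code the runtime returns:
'''
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
Calculate<<<dGrid, dBlock>>>(d_u, d_un, nx, c, dx, dt);
cudaEventRecord(stop);

// cudaDeviceSynchronize surfaces errors from the kernel, e.g. a launch timeout
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    std::cout << cudaGetErrorString(err) << std::endl;

float ms = 0.0f;
if (err == cudaSuccess)
{
    cudaEventElapsedTime(&ms, start, stop);
    std::cout << "Kernel time: " << ms << " ms" << std::endl;
}
'''
If the error string reports a launch timeout, the crash may be the display driver's watchdog terminating a kernel that runs too long, which would be consistent with larger arguments making the loops take longer.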
=== Re-Parallelize ===
The original parallelized solution I had was greatly flawed... so I have parallelized the code once again, this time using thread identifiers from the grid.
== New Kernel ==
'''
__global__ void Initalize(float* u, float* un, int nx, int nt, float dx)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i < nx && j < nx)
    {
        // replace k with i, because i represents the x dimension, which ascends
        // in a range from 0 to nx; so i * dx is equivalent to k * dx in the old
        // loop: for (int k = 0; k < nx; k++)
        if (i * dx >= 0.5 && i * dx <= 1)
            u[i * nx] = 2;
        else
            u[i * nx] = 1;
    }
}
'''
The old code was:
'''
for (int k = 0; k <= nx - 1; k++)
    if (k * dx >= 0.5 && k * dx <= 1)
    {
        u[k * nx] = 2;
    }
    else
    {
        u[k * nx] = 1;
    }
'''
So I removed the for loop and simply changed the "k" in the "if" statement to the thread identifier i: because x is the fastest-moving dimension, i ranges from 0 to nx just like the loop counter did. This matches the original Navier output. After this, a separate kernel was created with the following code...
'''
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;

// The original code had the following statement:
// u[m * nx + it] = un[m * nx + it - 1] - c*dt / dx*(un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);
// Rather than having each thread perform the c*dt / dx calculation, which would be
// an additional 2 instructions per thread, I have just stored it in a variable.
float total = c * dt / dx;

if (i < nx && j < nx)
{
    // The original code, as can be seen below, is basically copying array un to
    // array u, so I arranged the threads to do the same.
    un[j * nx + i] = u[j * nx + i];
    __syncthreads();

    if (i != 0)
    {
        // This part was a bit trickier. As seen in the original code below, array u
        // is written at [0,1], [0,2], [0,3], etc., copying values from array un's
        // [1,0], [1,1], [1,2], etc. range. The trick here is the -1 offset at the
        // end: because (it) starts at 1 in the original for loop, I added an if
        // condition so that the thread with value 0 does not perform the operation,
        // while element 0 can still be read through the -1 offset.
        u[i] = un[1 * nx + i - 1];
        __syncthreads();
    }
}
'''
Compared to the original code...
'''
for (int it = 1; it <= nx - 1; it++)
{
    for (int k = 0; k <= nx - 1; k++)
    {
        un[k * nx + it - 1] = u[k * nx + it - 1];
    }
    for (int m = 1; m <= nx - 1; m++)
    {
        u[0 * nx + it] = un[1 * nx + it - 1];
        u[m * nx + it] = un[m * nx + it - 1] - c * dt / dx * (un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);
    }
}
'''
Putting the pieces together, the full re-parallelized Calculate kernel is:
'''
__global__ void Calculate(float* u, float* un, int nx, int c, float dx, float dt)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    // moved out of the loop because there is no need to compute this nx times
    float total = c * dt / dx;
    if (i < nx && j < nx)
    {
        // format for coalesced memory access
        for (int it = 1; it <= nx - 1; it++)
        {
            un[i * nx + it - 1] = u[i * nx + it - 1];
            __syncthreads();
            u[it] = un[1 * nx + it - 1];
            __syncthreads();
            if (i != 0)
            {
                u[i * nx + it] = un[i * nx + it - 1] - total * (un[i * nx + it - 1] - un[(i - 1) * nx + it - 1]);
            }
            __syncthreads();
        }
    }
}
'''
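To sanity-check a re-parallelization like this, the device output can be compared element by element against the serial version. A minimal sketch (the verify name, the tolerance, and the h_u/h_ref host arrays are assumptions, not part of the original program):
'''
#include <cmath>

// h_u: result copied back from the device; h_ref: result of the original serial
// loops. Both are assumed to hold nx * nx floats filled in beforehand.
bool verify(const float* h_u, const float* h_ref, int nx, float tol = 1e-5f)
{
    for (int idx = 0; idx < nx * nx; idx++)
        if (std::fabs(h_u[idx] - h_ref[idx]) > tol)
            return false;
    return true;
}
'''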