Changes

BetaT

5,668 bytes removed, 18:48, 3 April 2017

→‎Parallelize

== Optimizing Problems ==

~~At the beginning of the program, a handle was created for grabbing the computer's CUDA device; through this handle we can extract the CUDA device properties to examine our parameters.~~

~~//list the properties~~__global__ void Calculate (float* u, float* un,int nx, int c, float dx, float dt)

~~std::cout << "Name:" << prop.name << std::endl;~~{

~~std::cout << "Compute Capability: " << prop.major << '.' << prop.minor << std::endl;~~ ~~std::cout << "Total Global Memory: " << prop~~int j = blockIdx.~~totalGlobalMem << std::endl;~~ ~~std::cout << "Max Threads per block: " << prop~~x * blockDim.~~maxThreadsPerBlock << std::endl;~~ ~~std::cout << "Clock Rate in khz: " << prop~~x + threadIdx.~~clockRate << "\n\n"~~x;

int i = blockIdx.y * blockDim.y + threadIdx.y;

~~We grab the device's properties so that we~~ // removes from instructions because no need to do ~~not exceed resources.~~this NX amount of times

~~The original algorithm was split into 2 kernels. The first kernel causing no problems is as follows~~ float total = c*dt / dx;

~~__global__ void Initalize(double* u, double* un, int nx, int nt, double dx)~~

{

~~int i = blockIdx.x * blockDim.x + threadIdx.x;~~

~~int j = blockIdx.y * blockDim.y + threadIdx.y;~~

~~for (int k = 0; k <= nx - 1; k++)~~

~~if (k*dx >= 0.5 && k*dx <= 1)~~

{

~~u[k * nt] = 2;~~

~~__syncthreads();~~

}

~~else~~

{

~~u[k * nt] = 1;~~

~~__syncthreads();~~

}

if (i < nx && j < nx)

{

// format for coalesced memory access

~~The second kernel works perfectly fine~~ for ~~arguments less than 1024 1024~~ (~~user inputs 2 values~~int it = 1; it <= nx- 1; it++)~~; anything higher, for example an argument of 2000 2000, will crash the NVIDIA driver, and the results will be left as they were before the kernel launch. The kernel code is below:~~

~~'''~~ ~~__global__ void Calculate (double* u, double* un,int nx, int c, double dx, double dt)~~ { ~~for (int it = 1; it <= nx - 1; it++)~~ { ~~for (int k = 0; k <= nx - 1; k++)~~ { ~~un[k * nx + it - 1] = u[k * nx + it - 1];~~ } ~~for (int m = 1; m <= nx - 1; m++)~~ { ~~u[0 * nx + it] = un[1 * nx + it - 1];~~ ~~u[m * nx + it] = un[m * nx + it - 1] - c*dt / dx*(un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);~~ } }~~}'''~~

if (i !=~~= Solution to first Kernel problem ==~~ 0 || i < nx )

The problem resulted from this calculation: '''u[m * nx + it] = un[m * nx + it - 1] - c*dt / dx*(un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);''' Perhaps using the c, dt & dx values incurred too many trips to global memory, which caused a hang in the operation, and CUDA automatically crashed the driver. To solve this problem, a scalar variable (local variable) was created to store this value in register memory for each thread to access. '''double total = c*dt / dx;'''... Now the program executes with an argument of 2000 2000 and yields similar results to the original program. {

~~'''~~ ~~__global__ void Calculate (double* u, double* un,int nx, int c, double dx, double dt)~~ { ~~double total = c*dt / dx;~~ ~~for (int it = 1; it <= nx - 1; it++)~~ { ~~for (int k = 0; k <= nx - 1; k++)~~ { un[k i * nx + it - 1] = u[~~k * nx + it - 1];~~ } ~~for (int m = 1; m <= nx - 1; m++)~~ { ~~u[0 * nx + it] = un[1 * nx + it - 1];~~ ~~u[m * nx + it] = un[m * nx + it - 1] - total * ( un[m * nx + it - 1] - un[(m - 1)~~ i * nx + it - 1] ); } }~~}'''~~

~~== Problem with second Kernel ==~~ __syncthreads();

~~Unfortunately, a new problem has arisen: when the argument is raised above 2000 & 2000, the NVIDIA driver once again crashes, and I am currently stuck with no solution.~~ u[it] = un[1 * nx + it - 1];

__syncthreads();

u[i * nx + it ] =~~== Re~~ un[i * nx + it- 1] - total * (un[i * nx + it - 1] - un[(i - 1) * nx + it - ~~Parallelize ===~~1]);

~~The original parallelized solution I had was greatly flawed... So I have parallelized the code once again, this time using Thread Identifiers from the Grid.~~ __syncthreads();

~~== New Kernel ==~~

~~'''~~ ~~__global__ void Initalize(float* u, float* un, int nx, int nt, float dx)~~ { ~~int i = blockIdx.x * blockDim.x + threadIdx.x;~~ ~~int j = blockIdx.y * blockDim.y + threadIdx.y;~~ ~~if (i < nx && j < nx)~~ { ~~// replace k with i. because i represents the x dimension which will ascend in a range from 0 to nx.~~ ~~// So i * dx will essentially be equivalent to...for (int k = 0; k < nx; k++)~~ ~~if (i*dx >= 0.5 && i*dx <= 1)~~ ~~// replace k here with i for the X dimension for the same reason as above~~ ~~u[i * nx] = 2;~~ ~~else~~ ~~u[i * nx] = 1;~~ } } ~~The old code was:~~ ~~for (int k = 0; k <= nx - 1; k++)~~ ~~if (k*dx >= 0.5 && k*dx <= 1)~~ { ~~u[k * nx] = 2;~~ } ~~else~~ { ~~u[k * nx] = 1;~~ ~~}'''~~ ~~So I removed the for loop and simply changed the "k" in the "if" statement to represent the threadIdx.x identifier because X is the fastest moving dimension~~ ~~it will range from 0 to NX like a for loop. This matches the original Naiver output.~~ ~~After this a separate kernel was created with the following code...~~

~~'''int i = blockIdx.x * blockDim.x + threadIdx.x;~~ ~~int j = blockIdx.y * blockDim.y + threadIdx.y;~~ ~~// The original code had the following statement:: u[m * nx + it] = un[m * nx + it - 1] - c*dt / dx*(un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);~~ ~~// Rather than having each thread perform this calculation which will be an additional 2 instructions per thread, i have just stored it in a variable~~ ~~float total = c*dt / dx;~~ ~~if (i < nx && j < nx)~~ { ~~// The original code as can be seen below is basically copying array un to array u. So i arranged the threads to do the same~~ ~~un[j * nx + i] = u[j * nx + i];~~ ~~__syncthreads();~~ ~~if (i != 0)~~ { ~~// This part was a bit trickier. As seen in the original code below array u would access all threads in the [0,0] [0,1] [0,2] etc...~~ ~~// And copy a value from array un's [1,1] [1,2] [1,3]..etc range. The trick here was the -1 difference at the end~~ // Because in the original for loop, (it) starts at the value 1, I added an if condition to make sure the threads don't perform the operation on the thread of value 0. But it can still be accessed through the -1 operator. ~~u[i] = un[1 * nx + i-1];~~ ~~__syncthreads();~~ } ~~}'''~~

~~Compared to the original code...~~ ~~for (int it = 1; it <= nx - 1; it++)~~ { ~~for (int k = 0; k <= nx - 1; k++)~~ { ~~un[k * nx + it - 1] = u[k * nx + it - 1];~~ } ~~for (int m = 1; m <= nx - 1; m++)~~ { ~~u[0 * nx + it] = un[1 * nx + it - 1];~~ ~~u[m * nx + it] = un[m * nx + it - 1] - c*dt / dx*(un[m * nx + it - 1] - un[(m - 1) * nx + it - 1]);~~ } } }

Jadach1

212

edits

Changes

BetaT

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools