212
edits
Changes
BetaT
,→Parallelizing with 2 Kernels
I have removed the 2 inner for loops but kept the outer loop.
The program takes 2 arrays. In the diagrams below, each x represents one element of an array.
// Advances the nx-by-nx row-major field `u` one column at a time: column `it`
// is computed from column `it - 1`, which is first copied into `un`.
//
// Parameters:
//   u   - field being advanced; read and written in place. Indexed up to
//         nx * nx - 1, so it must hold at least nx * nx floats.
//   un  - scratch buffer that receives the previous column of `u`; same size.
//   nx  - number of rows and number of columns.
//   c, dx, dt - only used through the constant coefficient c * dt / dx.
//
// Per column `it` (1 .. nx - 1):
//   1. every row i copies u[i][it - 1] into un[i][it - 1];
//   2. row 0 takes its value from the row below: u[0][it] = un[1][it - 1];
//   3. rows i >= 1 compute
//        u[i][it] = un[i][it - 1] - coeff * (un[i][it - 1] - un[i - 1][it - 1]).
//
// Launch notes:
//   * The row index i comes from the y dimension of the launch. The x index j is
//     only used as a bounds guard, so threads sharing the same i perform
//     identical writes.
//   * __syncthreads() orders threads of ONE block only. Row i reads the value
//     copied by row i - 1, so the result is only well defined when all nx rows
//     are handled by the same block. NOTE(review): for larger nx, launch one
//     kernel per column or use a cooperative-groups grid sync -- confirm
//     against the host-side launch configuration.
__global__ void Calculate (float* u, float* un,int nx, int c, float dx, float dt)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;

    // Loop-invariant coefficient: computed once and actually reused below.
    const float coeff = c * dt / dx;

    // Threads outside the grid do no memory work, but they still run the loop
    // so that every thread of the block reaches every barrier.
    const bool inGrid = (i < nx && j < nx);

    for (int it = 1; it <= nx - 1; it++)
    {
        // Step 1: snapshot the previous column.
        if (inGrid)
        {
            un[i * nx + it - 1] = u[i * nx + it - 1];
        }
        // The snapshot must be complete before neighbouring rows read it.
        __syncthreads();

        if (inGrid)
        {
            if (i == 0)
            {
                // Step 2: row 0 has no row above it, so it copies from row 1
                // instead of evaluating the difference formula (which would
                // read un[-nx + it - 1], i.e. out of bounds).
                u[it] = un[1 * nx + it - 1];
            }
            else
            {
                // Step 3: difference against the row above.
                u[i * nx + it] = un[i * nx + it - 1]
                               - coeff * (un[i * nx + it - 1] - un[(i - 1) * nx + it - 1]);
            }
        }
        // Keep the whole block in step before the next column is started.
        __syncthreads();
    }
}
Array 1 Array 2
oxxxx oxxxx
2nd: Array 1 will set the value at index [0,1] in its first row (marked by 2) to the value at index [1,0] of Array 2, i.e. the element in Array 2's second row and first column.
Array 1 Array 2
oxxxx oxxxx
3rd: Finally, Array 1 will calculate its next column (marked by 3) by performing the calculation shown above on Array 2's first column (also marked by 3). The X dimension is represented by the thread identifier and the Y dimension is represented by the for-loop iterator.
Array 1 Array 2
o3xxx 33xxx 3xxxx
o3xxx 33xxx o3xxx 33xxx o3xxx 33xxx o3xxx 33xxx __global__ void Calculate (float* u, float* un,int nx, int c, float dx, float dt) { int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; // removes from instructions because no need to do this NX amount of times float total = c*dt / dx; if (i < nx && j < nx) { for (int it = 1; it <= nx- 1; it++) { if (i != 0 || i < nx ) { un[i * nx + it-1] = u[i * nx + it-1]; __syncthreads(); u[it] = un[1 * nx + it - 1]; __syncthreads(); u[i * nx + it ] = un[i * nx + it- 1] - c*dt / dx* (un[i * nx + it - 1] - un[(i - 1) * nx + it - 1]); __syncthreads(); } } }3xxxx
o3xxx 3xxxx
o3xxx 3xxxx