93
edits
Changes
GPUSquad
,→Assignment 3
[[File:Dps_915_gpusquad_ghostcellexample.png]]
Code:
<source>
#include <cuda.h>
#include "cuda_runtime.h"
#include <device_launch_parameters.h>
#include <stdio.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
#ifndef __CUDACC__
#define __CUDACC__
#endif
// Load standard libraries
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cmath>
#include <chrono>
using namespace std;
const int ntpb = 1024;//TOTAL number of threads per block
const int ntpbXY = 32;
__device__ __constant__ int d_m;
__device__ __constant__ int d_n;
__device__ __constant__ float d_xmin;
__device__ __constant__ float d_xmax;
__device__ __constant__ float d_ymin;
__device__ __constant__ float d_ymax;
__device__ __constant__ float d_pi;
__device__ __constant__ float d_omega;
__device__ __constant__ float d_dx;
__device__ __constant__ float d_dy;
__device__ __constant__ float d_dxxinv;
__device__ __constant__ float d_dyyinv;
__device__ __constant__ float d_dcent;
__device__ __constant__ int d_nBlocks;
__device__ __constant__ int d_ntpbXY;
void check(cudaError_t err) {
if (err != cudaSuccess)
std::cerr << "ERROR: *** " << cudaGetErrorString(err) << " ****" << std::endl;
}
</source>
<pre style="color: blue">
//every thread does a single calculation--remove iterations
//-take everything from one dimensional global, and put into 2d shared--shared has ghost b/c need for threads--
//--global is 1d indexed ,shared is 2d indexed.
//TODO: remove for loop from kernel and index entirely with blockIdx, blockDim, and threadIdx (check workshop 7 for indexing by shared memory within blocks)
__global__ void matrixKernel(float* a, float* b) {//MODIFY to suit algorithm
//int j= blockDim.x*blockIdx.x + threadIdx.x+1;//TODO: CHECK THIS DOESN'T MESS UP GLOBAL EXECUTION
int j = threadIdx.y;
int i = blockDim.x*blockIdx.x + threadIdx.x;
//int i = threadIdx.y + 1;//original code has k = 1 and <=total_iters--since k will now start at 0, set end at < total_iters
//above: we are using block and thread indexes to replace some of the iteration logic
int tj = threadIdx.y;
int ti = threadIdx.x;
int tij = threadIdx.x + ntpbXY*threadIdx.y;
//__shared__ float bShared[ntpbXY*(ntpbXY+2)];//add a couple of ghost columns
__shared__ float bShared[ntpbXY+2][ntpbXY];
int ij = i + d_m*j;
//NOTES ON SHARED INDEX OFFSET:
//x offset is -1, so to get the current index for the thread for global, it is bShared[ti+i][tj]
//-to get global x index -1 is bShared[ti][tj]
//-to get global x index is bShared[ti+1][tj]
//-to get global x index +1 is bShared[ti+2][tj]
//set left ghost cells
if (threadIdx.x == 0 && blockIdx.x != 0)
bShared[ti][tj] = b[ij - 1];
//set right ghost cells
if (threadIdx.x == ntpbXY - 1 && blockIdx.x != d_nBlocks - 1)
bShared[ti+2][tj] = b[d_ntpbXY* j + i+1];
//set ghost cell for current thread relative to global memory
bShared[ti+1][tj] = b[d_m* j + i];
__syncthreads();
//SHARED MEMORY VERSION:==========
if (!(blockIdx.x == 0 && threadIdx.x == 0 || blockIdx.x == (d_nBlocks - 1) && threadIdx.x == (d_ntpbXY - 1)) && threadIdx.y != 0 && threadIdx.y != (d_ntpbXY - 1)) {
float x = d_xmin + i*d_dx, y = d_ymin + j*d_dy;
float input = abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
a[d_m* j + i] = (input + d_dxxinv*(bShared[ti][tj] + bShared[ti+2][tj])//TODO: does the program logic allow for this to go out of range??
+ d_dyyinv*(bShared[ti+1][tj-1] + bShared[ti+1][tj+1]))*d_dcent;
__syncthreads();
}
}
</pre>
<source>
// Set grid size and number of iterations
const int save_iters = 20;
const int total_iters = 5000;
const int error_every = 2;
//HERE:
const int m = 32, n =32 ;
const float xmin = -1, xmax = 1;
const float ymin = -1, ymax = 1;
// Compute useful constants
const float pi = 3.1415926535897932384626433832795;
const float omega = 2 / (1 + sin(2 * pi / n));
const float dx = (xmax - xmin) / (m - 1);//TODO: modify to fit kernel?
const float dy = (ymax - ymin) / (n - 1);//TODO: modify to fit kernel?
const float dxxinv = 1 / (dx*dx);
const float dyyinv = 1 / (dy*dy);
const float dcent = 1 / (2 * (dxxinv + dyyinv));
// Input function
inline float f(int i, int j) {
float x = xmin + i*dx, y = ymin + j*dy;
return abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
}
// Common output and error routine
void outputImage(char* filename, float *a) {
// Computes the error if sn%error every==0
// Saves the matrix if sn<=save iters
int i, j, ij = 0, ds = sizeof(float);
float x, y, data_float; const char *pfloat;
pfloat = (const char*)&data_float;
ofstream outfile;
static char fname[256];
sprintf(fname, "%s.%d", filename, 101);
outfile.open(fname, fstream::out
| fstream::trunc | fstream::binary);
data_float = m; outfile.write(pfloat, ds);
for (i = 0; i < m; i++) {
x = xmin + i*dx;
data_float = x; outfile.write(pfloat, ds);
}
for (j = 0; j < n; j++) {
y = ymin + j*dy;
data_float = y;
outfile.write(pfloat, ds);
for (i = 0; i < m; i++) {
data_float = a[ij++];
outfile.write(pfloat, ds);
}
}
outfile.close();
}
void dojacobi() {
int i, j, ij, k;
float error, z;
float *a, *b;
float *u;
float *v;
u = new float[m*n];
v = new float[m*n];
// Set initial guess to be identically zero
for (ij = 0; ij < m*n; ij++) u[ij] = v[ij] = 0;
a = v; b = u;
float* d_a;
float* d_b;
int nBlocks = ((n*m + ntpb - 1) / ntpb);
//malloc
cudaMalloc((void**)&d_a, m*n * sizeof(float));
cudaMalloc((void**)&d_b, m*n * sizeof(float));
cudaMemcpy(d_a, a, n* m * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, n* m * sizeof(float), cudaMemcpyHostToDevice);
dim3 dGrid(nBlocks, 1);//1D grid of blocks (means we don't have to ghost 1st and last rows of matrices w/i a block
dim3 dBlock(32, 32);
cudaMemcpyToSymbol(d_nBlocks, &nBlocks, sizeof(int));
cudaMemcpyToSymbol(d_ntpbXY, &ntpbXY, sizeof(int));
// Carry out Jacobi iterations
for (k = 1; k <= total_iters; k++) {
if (k % 2 == 0) {
cudaError_t error = cudaGetLastError();
//RUN KERNEL:
matrixKernel << <dGrid, dBlock >> > (d_a, d_b);
cudaDeviceSynchronize();
check(cudaGetLastError());
}
else {
cudaError_t error = cudaGetLastError();
matrixKernel << <dGrid, dBlock >> > (d_b, d_a);
cudaDeviceSynchronize();
if (cudaGetLastError()) {
std::cout << "error";
}
}
}
cudaMemcpy(a, d_a, n* m * sizeof(float), cudaMemcpyDeviceToHost);
outputImage("jacobi out", a);
cudaFree(d_a);
cudaFree(d_b);
delete[] u;
delete[] v;
}
int main() {
cudaError_t cuerr;
cuerr = cudaMemcpyToSymbol(d_m, &m, sizeof(int));
if (cudaGetLastError()) {
std::cout << cudaGetErrorString(cuerr) << std::endl;
}
</source>
<pre style="color: blue">
cudaMemcpyToSymbol(d_n, &n, sizeof(int));
cudaMemcpyToSymbol(d_xmin, &xmin, sizeof(float));
cudaMemcpyToSymbol(d_xmax, &xmax, sizeof(float));
cudaMemcpyToSymbol(d_ymin, &ymin, sizeof(float));
cudaMemcpyToSymbol(d_ymax, &ymax, sizeof(float));
cudaMemcpyToSymbol(d_pi, &pi, sizeof(float));
cudaMemcpyToSymbol(d_omega, &pi, sizeof(float));
cudaMemcpyToSymbol(d_dx, &dx, sizeof(float));
cudaMemcpyToSymbol(d_dy, &dy, sizeof(float));
</pre>
<source>
cuerr = cudaMemcpyToSymbol(d_dxxinv, &dxxinv, sizeof(float));
if (cudaGetLastError()) {
std::cout << "error" << std::endl;
std::cout << cudaGetErrorString(cuerr) << std::endl;
}
cudaMemcpyToSymbol(d_dyyinv, &dyyinv, sizeof(float));
cuerr = cudaMemcpyToSymbol(d_dcent, &dcent, sizeof(float));
if (cudaGetLastError()) {
//std::cout << "error";
std::cout << cudaGetErrorString(cuerr) << std::endl;
}
std::chrono::steady_clock::time_point ts, te;
ts = std::chrono::steady_clock::now();
dojacobi();
te = std::chrono::steady_clock::now();
std::chrono::steady_clock::duration duration = te - ts;
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(duration);
std::cout << "Parallel Code Time: " << ms.count() << " ms" << std::endl;
cudaDeviceReset();
return 0;
}
</source>