</source>
<nowiki>****************</nowiki>
CODE FOR 1D BLOCK OF 1D THREADS WITH CONSTANT MEMORY:
We tried replacing the global memory values that were used as calculation constants with constant memory, since these values never need to be modified by kernel logic.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Load standard libraries
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cmath>
#include <chrono>
using namespace std;
</source>
<pre style="color: blue">
__device__ __constant__ int d_m;
__device__ __constant__ int d_n;
__device__ __constant__ float d_xmin;
__device__ __constant__ float d_xmax;
__device__ __constant__ float d_ymin;
__device__ __constant__ float d_ymax;
__device__ __constant__ float d_pi;
__device__ __constant__ float d_omega;
__device__ __constant__ float d_dx;
__device__ __constant__ float d_dy;
__device__ __constant__ float d_dxxinv;
__device__ __constant__ float d_dyyinv;
__device__ __constant__ float d_dcent;
</pre>
<source lang="cpp">
__global__ void matrixKernel(float* a, float* b) {
    // The calculation constants that were kernel parameters in assignment 2
    // now come from constant memory, so only the two data arrays are passed in.
    // Block and thread indexes replace the outer loop over j; the +1 skips the boundary row.
    int j = blockIdx.x*blockDim.x + threadIdx.x + 1;
    if (j < d_n - 1) {
        for (int i = 1; i < d_m - 1; i++) {
            int ij = i + d_m*j;
            float x = d_xmin + i*d_dx, y = d_ymin + j*d_dy;
            float input = fabsf(x) > 0.5f || fabsf(y) > 0.5f ? 0.0f : 1.0f;
            a[ij] = (input + d_dxxinv*(b[ij - 1] + b[ij + 1])
                + d_dyyinv*(b[ij - d_m] + b[ij + d_m]))*d_dcent;
        }
    }
}
// Set grid size and number of iterations
const int save_iters = 20;
const int total_iters = 5000;
const int error_every = 2;
const int m = 32, n = 1024;
const float xmin = -1, xmax = 1;
const float ymin = -1, ymax = 1;
// Compute useful constants
const float pi = 3.1415926535897932384626433832795;
const float omega = 2 / (1 + sin(2 * pi / n));
const float dx = (xmax - xmin) / (m - 1);
const float dy = (ymax - ymin) / (n - 1);
const float dxxinv = 1 / (dx*dx);
const float dyyinv = 1 / (dy*dy);
const float dcent = 1 / (2 * (dxxinv + dyyinv));
// Input function
inline float f(int i, int j) {
    float x = xmin + i*dx, y = ymin + j*dy;
    return abs(x) > 0.5 || abs(y) > 0.5 ? 0 : 1;
}
// Common output routine: writes the grid coordinates and solution values to a binary file
void outputImage(const char* filename, float *a) {
    int i, j, ij = 0, ds = sizeof(float);
    float x, y, data_float; const char *pfloat;
    pfloat = (const char*)&data_float;
    ofstream outfile;
    static char fname[256];
    sprintf(fname, "%s.%d", filename, 101);
    outfile.open(fname, fstream::out
        | fstream::trunc | fstream::binary);
    data_float = m; outfile.write(pfloat, ds);
    for (i = 0; i < m; i++) {
        x = xmin + i*dx;
        data_float = x; outfile.write(pfloat, ds);
    }
    for (j = 0; j < n; j++) {
        y = ymin + j*dy;
        data_float = y;
        outfile.write(pfloat, ds);
        for (i = 0; i < m; i++) {
            data_float = a[ij++];
            outfile.write(pfloat, ds);
        }
    }
    outfile.close();
}
void dojacobi() {
    int ij, k;
    float *a, *b;
    float *u;
    float *v;
    u = new float[m*n];
    v = new float[m*n];
    // Set initial guess to be identically zero
    for (ij = 0; ij < m*n; ij++) u[ij] = v[ij] = 0;
    a = v; b = u;
    float* d_a;
    float* d_b;
    // Allocate the two device buffers and copy the initial guess over
    cudaMalloc((void**)&d_a, m*n * sizeof(float));
    cudaMalloc((void**)&d_b, m*n * sizeof(float));
    cudaMemcpy(d_a, a, n*m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, n*m * sizeof(float), cudaMemcpyHostToDevice);
    // With n = 1024 this launches a single 1D block of 1024 threads
    int nblocks = n / 1024;
    dim3 dGrid(nblocks);
    dim3 dBlock(1024);
    // Carry out Jacobi iterations, swapping the roles of the two buffers each pass
    for (k = 1; k <= total_iters; k++) {
        if (k % 2 == 0) {
            matrixKernel << <dGrid, dBlock >> > (d_a, d_b);
        }
        else {
            matrixKernel << <dGrid, dBlock >> > (d_b, d_a);
        }
        cudaDeviceSynchronize();
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            std::cout << "kernel error: " << cudaGetErrorString(err) << std::endl;
        }
    }
    cudaMemcpy(a, d_a, n*m * sizeof(float), cudaMemcpyDeviceToHost);
    outputImage("jacobi out", a);
    cudaFree(d_a);
    cudaFree(d_b);
    delete[] u;
    delete[] v;
}
int main() {
    // Copy the calculation constants into constant memory before any kernel launch.
    // Note: cudaMemcpyToSymbol takes the symbol itself, not its address.
    cudaMemcpyToSymbol(d_m, &m, sizeof(int));
    cudaMemcpyToSymbol(d_n, &n, sizeof(int));
    cudaMemcpyToSymbol(d_xmin, &xmin, sizeof(float));
    cudaMemcpyToSymbol(d_xmax, &xmax, sizeof(float));
    cudaMemcpyToSymbol(d_ymin, &ymin, sizeof(float));
    cudaMemcpyToSymbol(d_ymax, &ymax, sizeof(float));
    cudaMemcpyToSymbol(d_pi, &pi, sizeof(float));
    cudaMemcpyToSymbol(d_omega, &omega, sizeof(float));
    cudaMemcpyToSymbol(d_dx, &dx, sizeof(float));
    cudaMemcpyToSymbol(d_dy, &dy, sizeof(float));
    cudaMemcpyToSymbol(d_dxxinv, &dxxinv, sizeof(float));
    cudaMemcpyToSymbol(d_dyyinv, &dyyinv, sizeof(float));
    cudaMemcpyToSymbol(d_dcent, &dcent, sizeof(float));
    std::chrono::steady_clock::time_point ts, te;
    ts = std::chrono::steady_clock::now();
    dojacobi();
    te = std::chrono::steady_clock::now();
    std::chrono::steady_clock::duration duration = te - ts;
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(duration);
    std::cout << "Parallel Code Time: " << ms.count() << " ms" << std::endl;
    cudaDeviceReset();
    return 0;
}
</source>
<nowiki>****************</nowiki>
FINAL TIMINGS <pre style="color: red"> THE GRAPH IMMEDIATELY BELOW IS INCORRECT: there was an error recording the 1D runtimes for assignment 2</pre>
[[File:Code_timings.png]]
The above graph includes the total run times for the serial code, the 1D kernel from assignment 2, the 1D kernel that uses constant memory for the calculation constants, a 2D-thread kernel that uses global and constant memory, and the same 2D arrangement with shared memory and ghost cells. We found that the most efficient version of the code was the 1D implementation that used constant memory. Because the shared memory version of the kernel required thread synchronization to stage data in shared memory every time the kernel was launched, and the kernel was launched 5000 times in each version of our code, this added memory-setup overhead actually made execution slower than the version that reads global memory directly.
The 1D design also ran better than the 2D implementation for a couple of reasons, including that it scaled along the m dimension, which still produced readable graphs.
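To make the shared-memory overhead concrete, the sketch below shows roughly what a 2D tile with ghost cells has to do at the start of every launch before any stencil arithmetic can begin. This is a minimal illustration under assumptions, not our exact assignment kernel: the TILE size, the tile array name, and the halo-loading pattern were made up for the example, and the source term is omitted.
<source lang="cpp">
// Minimal sketch (not the assignment kernel) of a 2D Jacobi-style stencil that
// stages a tile plus ghost cells in shared memory. TILE and the index math are
// assumptions for illustration; the source term is omitted for brevity.
#define TILE 16

__global__ void stencilSharedSketch(float* a, const float* b, int m, int n,
    float dxxinv, float dyyinv, float dcent) {
    __shared__ float tile[TILE + 2][TILE + 2];      // interior cells + one ghost cell per side

    int i = blockIdx.x*TILE + threadIdx.x;          // global column index
    int j = blockIdx.y*TILE + threadIdx.y;          // global row index
    int ti = threadIdx.x + 1, tj = threadIdx.y + 1; // position inside the shared tile

    if (i < m && j < n) {
        tile[tj][ti] = b[i + m*j];                  // stage this thread's cell
        // Threads on the tile edges also stage the ghost cells (bounds-checked).
        if (threadIdx.x == 0 && i > 0)              tile[tj][0] = b[(i - 1) + m*j];
        if (threadIdx.x == TILE - 1 && i < m - 1)   tile[tj][TILE + 1] = b[(i + 1) + m*j];
        if (threadIdx.y == 0 && j > 0)              tile[0][ti] = b[i + m*(j - 1)];
        if (threadIdx.y == TILE - 1 && j < n - 1)   tile[TILE + 1][ti] = b[i + m*(j + 1)];
    }
    // Every launch pays for this barrier before any arithmetic happens.
    __syncthreads();

    if (i > 0 && i < m - 1 && j > 0 && j < n - 1) {
        a[i + m*j] = (dxxinv*(tile[tj][ti - 1] + tile[tj][ti + 1])
            + dyyinv*(tile[tj - 1][ti] + tile[tj + 1][ti]))*dcent;
    }
}
</source>
Launched with a TILE-by-TILE block over the m-by-n domain, every one of the 5000 launches repeats this staging and the __syncthreads() barrier, which is the per-launch setup cost discussed above.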
One of the issues we encountered when trying to profile the code was that different group members were working with different hardware; the hardware changed based on the rooms we ended up profiling in (open-lab computers vs. lab computers vs. laptops with different video cards). The graph above was produced on an open-lab computer with a Quadro K620 card.
[TODO: INCLUDE PROFILING BREAKDOWNS OF INDIVIDUAL (NOT 5000) KERNEL RUNS TO SEE SPECIFIC TIMELINE FEATURES. EXPLAIN THE DIFFERENCES IN RUN TIMES]
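As a first step toward that breakdown, one option (in addition to the visual profiler timelines) would be to bracket a single launch with CUDA events. The snippet below is only a sketch and assumes the same matrixKernel, d_a, d_b, dGrid, and dBlock that dojacobi() sets up above.
<source lang="cpp">
// Sketch: timing one kernel launch with CUDA events (assumes the setup from dojacobi()).
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
matrixKernel << <dGrid, dBlock >> > (d_a, d_b);
cudaEventRecord(stop);
cudaEventSynchronize(stop);            // wait for the launch to finish

float launch_ms = 0.0f;
cudaEventElapsedTime(&launch_ms, start, stop);
std::cout << "single kernel launch: " << launch_ms << " ms" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);
</source>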