44
edits
Changes
→Assignment 2
=== Assignment 2 ===
====== Source Files ======
{| class="wikitable mw-collapsible mw-collapsed"
! poisson-pcie.cu
|-
|
<source>
/*
* Poisson Method using two arrays.
* Non-Ghost Cells Method
* Multiple PCIe Calls made, once per iteration
* by Tony Sim
*/
#include <cstring>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include "poisson.cuh"
namespace DPS{
Poisson::Poisson(std::ifstream& ifs) {
std::string line;
nColumns = 0;
bufferSide = 0;
nRowsTotal = 0;
/* find number of columns */
std::getline(ifs,line);
for (size_t i = 0 ; i < line.size() ; i++){
if(line[i]==' ') nColumns++;
}
nColumns++;
/* find number of rows */
nRowsTotal++; /* already fetched one */
while(std::getline(ifs,line))
nRowsTotal++;
ifs.clear();
try{
for (size_t i = 0 ; i < 2 ; i++)
h_data[i] = new float[ (nColumns+2) * (nRowsTotal+2)]; /* add edge buffers */
}
catch (...){
throw std::runtime_error("Failed to Allocate Memory");
}
/* readin data */
std::cout <<"Reading in data"<<std::endl;
ifs.seekg(0,ifs.beg);
/* allocate memory to all but the edge buffer, index 0 and max for each row and column */
for (size_t i = 0 ; i < nRowsTotal+2 ; i++){
for (size_t j = 0 ; j < nColumns+2 ; j++){
float val = 0;
if(!(i == 0 || i == nRowsTotal + 1 || j == 0 || j == nColumns + 1))
ifs >> val;
h_data[0][i*(nColumns+2)+j] = val;
}
}
std::cout <<"Setting buffer"<<std::endl;
std::memset(h_data[1],0,(nRowsTotal+2)*(nColumns+2)*sizeof(float));
bool state = devMemSet();
}
Poisson::Poisson(const size_t r, const size_t c, float* d) {
bufferSide = 0;
nRowsTotal = r;
nColumns = c;
try{
h_data[0] = new float[(r+2)*(c+2)];
h_data[1] = new float[(r+2)*(c+2)];
}
catch (...){
throw std::runtime_error("Failed to Allocate Memory");
}
std::memcpy(h_data[0],d,(r+2)*(c+2)*sizeof(float));
std::memset(h_data[1],0,(r+2)*(c+2)*sizeof(float));
devMemSet();
}
Poisson::~Poisson(){
for( size_t i = 0 ; i < 2 ; i++){
delete [] h_data[i];
cudaFree(d_data[i]);
}
}
bool Poisson::devMemSet(){
for(size_t i = 0 ; i < 2 ; i++){
cudaMalloc(&d_data[i],(nColumns+2)*(nRowsTotal+2)*sizeof(float));
if(d_data[i] != nullptr){
cudaError_t state = cudaMemcpy((void*)d_data[i],(const void*)h_data[i],(nColumns+2)*(nRowsTotal+2)*sizeof(float),cudaMemcpyHostToDevice);
if(state != cudaSuccess)
std::cerr << "ERROR on devMemSet for : " << i <<" with : " << cudaGetErrorString(state)<< std::endl;
}
}
return d_data[0]&&d_data[1];
}
float* Poisson::operator()(const size_t nIterations, const float wx, const float wy){
/* calculate the grid, block, where block has 1024 threads total */
unsigned int blockx = 32;
unsigned int blocky = 32;
unsigned int gridx = ((nRowsTotal+2)+blockx-1)/blockx;
unsigned int gridy = ((nRowsTotal+2)+blocky-1)/blocky;
/* create dim3 */
dim3 dBlock= {blockx,blocky};
dim3 dGrid = {gridx,gridy};
/* run iterations */
for (size_t i = 0; i < nIterations; i++) {
update<<<dGrid,dBlock>>>(d_data[1-bufferSide],d_data[bufferSide],nColumns, nRowsTotal, wx, wy);
bufferSwitch();
}
/* DEBUG */ h_data[bufferSide][1*(nColumns+2) + 1] = 100.0f;
/* output results from device to host */
cudaError_t state = cudaMemcpy(h_data[bufferSide],d_data[bufferSide],(nColumns+2)*(nRowsTotal+2)*sizeof(float),cudaMemcpyDeviceToHost);
if(state != cudaSuccess)
std::cout << "ERROR on () when copying data back to host" <<" with : " << cudaGetErrorString(state)<< std::endl;
return h_data[bufferSide];
}
void Poisson::show(std::ostream& ofs) const{
ofs << std::fixed << std::setprecision(1);
for (size_t j = 1; j <= nColumns ; j++) {
for (size_t i = 1 ; i <= nRowsTotal ; i++)
ofs << std::setw(8) << h_data[bufferSide][i * (nColumns+2) + j]<<",";
ofs << std::endl;
}
}
__global__ void update (float* newD, const float* currD, int nCol, int nRow, const float wx, const float wy){
size_t i = blockDim.x * blockIdx.x + threadIdx.x + 1; /* for x axis */
size_t j = blockDim.y * blockIdx.y + threadIdx.y + 1; /* for y axis */
newD[i*(nCol+2)+j] = currD[i * (nCol+2) +j] + wx*(currD[(i+1) * (nCol+2) +j] + currD[(i-1) * (nCol+2) +j] - 2.0f * currD[i * (nCol+2) +j] ) + wy*( currD[i * (nCol+2) +j+1] + currD[i * (nCol+2) +j-1] - 2.0f * currD[i * (nCol+2) +j]) ;
__syncthreads();
}
}
</source>
|}
{| class="wikitable mw-collapsible mw-collapsed"
! poisson-alt.cu
|-
|
<source>
/*
* Poisson Method using two arrays.
* Non-Ghost Cells Method
* One PCIe Call made, iterations done in kernel
* by Tony Sim
*/
#include <cstring>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include "poisson-alt.cuh"
namespace DPS{
Poisson::Poisson(std::ifstream& ifs) {
blockx = 32;
blocky = 32;
std::string line;
nColumns = 0;
bufferSide = 0;
nRowsTotal = 0;
/* find number of columns */
std::getline(ifs,line);
for (size_t i = 0 ; i < line.size() ; i++){
if(line[i]==' ') nColumns++;
}
nColumns++;
/* find number of rows */
nRowsTotal++; /* already fetched one */
while(std::getline(ifs,line))
nRowsTotal++;
ifs.clear();
int sizeX = ((nColumns + 2 + blockx + 2 - 1)/(blockx+2))*(blockx+2);
int sizeY = ((nRowsTotal + 2 + blocky + 2 - 1)/(blocky+2))*(blocky+2);
bufferSize = sizeX * sizeY;
std::cout << "Allocate initial memory" << std::endl;
try{
h_data = new float[ bufferSize ]; /* add edge buffers */
}
catch (...){
throw std::runtime_error("Failed to Allocate Memory");
}
/* readin data */
std::cout <<"Reading in data"<<std::endl;
ifs.seekg(0,ifs.beg);
/* allocate memory to all but the edge buffer, index 0 and max for each row and column */
std::memset(h_data,0,bufferSize);
for (size_t i = 0 ; i < nRowsTotal+2 ; i++){
for (size_t j = 0 ; j < nColumns+2 ; j++){
float val = 0;
if(!(i == 0 || i == nRowsTotal + 1 || j == 0 || j == nColumns + 1))
ifs >> val;
h_data[i*(nColumns+2)+j] = val;
}
}
std::cout <<"Setting buffer"<<std::endl;
bool state = devMemSet();
}
Poisson::Poisson(const size_t r, const size_t c, float* d) {
bufferSide = 0;
nRowsTotal = r;
nColumns = c;
try{
h_data = new float[(r+2)*(c+2)];
}
catch (...){
throw std::runtime_error("Failed to Allocate Memory");
}
std::memcpy(h_data,d,(r+2)*(c+2)*sizeof(float));
devMemSet();
}
Poisson::~Poisson(){
delete [] h_data;
cudaFree(d_data);
cudaDeviceReset();
}
bool Poisson::devMemSet(){
/* create double buffer */
cudaMalloc(&d_data,2* bufferSize * sizeof(float));
if(d_data != nullptr){
/* copy the initial information to the first buffer */
cudaError_t state = cudaMemcpy((void*)d_data,(const void*)h_data, bufferSize * sizeof(float),cudaMemcpyHostToDevice);
if(state != cudaSuccess)
std::cerr << "ERROR on devMemSet at cudaMemcpy : " << cudaGetErrorString(state)<< std::endl;
/* set the second buffer to zero */
state = cudaMemset( d_data + bufferSize , 0, bufferSize * sizeof(float));
if(state != cudaSuccess)
std::cerr << "ERROR on devMemSet at cudaMemset : " << cudaGetErrorString(state)<< std::endl;
}
return d_data;
}
float* Poisson::operator()(const size_t nIterations, const float wx, const float wy){
/* calculate the grid, block, where block has 1024 threads total */
unsigned int gridx = ((nRowsTotal+2)+blockx-1)/blockx;
unsigned int gridy = ((nRowsTotal+2)+blocky-1)/blocky;
/* create dim3 */
dim3 dBlock= {blockx,blocky};
dim3 dGrid = {gridx,gridy};
/* run iterations */
update<<<dGrid,dBlock>>>(d_data,nColumns, nRowsTotal, wx, wy,nIterations,bufferSize);
/*DEBUG */ h_data[2*(nColumns+2)+2] = 100.0f;
/* output results from device to host */
cudaError_t state = cudaMemcpy(h_data,d_data,(nColumns+2)*(nRowsTotal+2)*sizeof(float),cudaMemcpyDeviceToHost);
if(state != cudaSuccess)
std::cout << "ERROR on () when copying data back to host with : " << cudaGetErrorString(state)<< std::endl;
return h_data;
}
void Poisson::show(std::ostream& ofs) const{
ofs << std::fixed << std::setprecision(1);
for (size_t j = 1; j <= nColumns ; j++) {
for (size_t i = 1 ; i <= nRowsTotal ; i++)
ofs << std::setw(8) << h_data[i * (nColumns+2) + j]<<",";
ofs << std::endl;
}
}
__global__ void update (float* data, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations, unsigned int bufferSize){
size_t i = blockDim.x * blockIdx.x + threadIdx.x + 1; /* for x axis */
size_t j = blockDim.y * blockIdx.y + threadIdx.y + 1; /* for y axis */
unsigned int buffer = 0;
/* run iterations */
for (unsigned int n = 0 ; n < nIterations; n++){
/* Calculate and store into the other buffer */
data[(1-buffer)*bufferSize + i*(nCol+2)+j] = data[buffer*bufferSize + i * (nCol+2)+ j]
+ wx * (data[buffer*bufferSize + (i+1) * (nCol+2) +j] + data[buffer*bufferSize + (i-1) * (nCol+2) + j] - 2.0f * data[buffer*bufferSize + i * (nCol+2)+ j])
+ wy * (data[buffer*bufferSize + i * (nCol+2) + j + 1] + data[buffer*bufferSize + i * (nCol+2) + j - 1] - 2.0f * data[buffer*bufferSize + i * (nCol+2)+ j]);
__syncthreads();
/* flip buffer */
buffer = 1-buffer;
}
/* copy the output back into global memory */
data[i*(nCol+2)+j] = data[buffer * bufferSize + i * (nCol+2) + j ];
__syncthreads();
}
}
</source>
|}
{| class="wikitable mw-collapsible mw-collapsed"
! Poisson PCIe Profile
|-
|
<source>
</source>
|}
{| class="wikitable mw-collapsible mw-collapsed"
! Poisson AltProfile
|-
|
<source>
</source>
|}
[[File:Gc-spa.png | 800px]]
=== Assignment 3 ===