Difference between revisions of "Ghost Cells"
(→Assignment 3) |
(→Assignment 3) |
||
Line 1,190: | Line 1,190: | ||
=== Assignment 3 === | === Assignment 3 === | ||
+ | ==== Source Codes ==== | ||
{| class="wikitable mw-collapsible mw-collapsed" | {| class="wikitable mw-collapsible mw-collapsed" | ||
! PCIe Optimization | ! PCIe Optimization | ||
Line 1,343: | Line 1,344: | ||
</source> | </source> | ||
|} | |} | ||
+ | |||
+ | |||
+ | {| class="wikitable mw-collapsible mw-collapsed" | ||
+ | ! For-loop Optimization | ||
+ | |- | ||
+ | | | ||
+ | <source> | ||
+ | /* | ||
+ | * Poisson Method using two arrays. | ||
+ | * Non-Ghost Cells Method | ||
+ | * Multiple PCIe Calls made, once per iteration | ||
+ | * by Tony Sim | ||
+ | */ | ||
+ | #include <cstring> | ||
+ | #include <cstdlib> | ||
+ | #include <iomanip> | ||
+ | #include <iostream> | ||
+ | #include <string> | ||
+ | #include <cuda_runtime.h> | ||
+ | #include "poisson-alt-ghost2.cuh" | ||
+ | |||
+ | namespace DPS{ | ||
+ | |||
+ | Poisson::Poisson(std::ifstream& ifs) { | ||
+ | blockx = 32; | ||
+ | blocky = 32; | ||
+ | |||
+ | std::string line; | ||
+ | nColumns = 0; | ||
+ | bufferSide = 0; | ||
+ | nRowsTotal = 0; | ||
+ | /* find number of columns */ | ||
+ | std::getline(ifs,line); | ||
+ | for (size_t i = 0 ; i < line.size() ; i++){ | ||
+ | if(line[i]==' ') nColumns++; | ||
+ | } | ||
+ | nColumns++; | ||
+ | |||
+ | /* find number of rows */ | ||
+ | nRowsTotal++; /* already fetched one */ | ||
+ | while(std::getline(ifs,line)) | ||
+ | nRowsTotal++; | ||
+ | ifs.clear(); | ||
+ | |||
+ | int sizeX = ((nColumns + 2 + blockx + 2 - 1)/(blockx+2))*(blockx+2); | ||
+ | int sizeY = ((nRowsTotal + 2 + blocky + 2 - 1)/(blocky+2))*(blocky+2); | ||
+ | bufferSize = sizeX * sizeY; | ||
+ | std::cout << "Allocate initial memory" << std::endl; | ||
+ | try{ | ||
+ | h_data = new float[ bufferSize ]; /* add edge buffers */ | ||
+ | } | ||
+ | catch (...){ | ||
+ | throw std::runtime_error("Failed to Allocate Memory"); | ||
+ | } | ||
+ | |||
+ | /* readin data */ | ||
+ | std::cout <<"Reading in data"<<std::endl; | ||
+ | ifs.seekg(0,ifs.beg); | ||
+ | /* allocate memory to all but the edge buffer, index 0 and max for each row and column */ | ||
+ | std::memset(h_data,0,bufferSize); | ||
+ | for (size_t i = 0 ; i < nRowsTotal+2 ; i++){ | ||
+ | for (size_t j = 0 ; j < nColumns+2 ; j++){ | ||
+ | float val = 0; | ||
+ | if(!(i == 0 || i == nRowsTotal + 1 || j == 0 || j == nColumns + 1)) | ||
+ | ifs >> val; | ||
+ | h_data[i*(nColumns+2)+j] = val; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | std::cout <<"Setting buffer"<<std::endl; | ||
+ | bool state = devMemSet(); | ||
+ | |||
+ | } | ||
+ | |||
+ | Poisson::Poisson(const size_t r, const size_t c, float* d) { | ||
+ | bufferSide = 0; | ||
+ | nRowsTotal = r; | ||
+ | nColumns = c; | ||
+ | try{ | ||
+ | h_data = new float[(r+2)*(c+2)]; | ||
+ | } | ||
+ | catch (...){ | ||
+ | throw std::runtime_error("Failed to Allocate Memory"); | ||
+ | } | ||
+ | std::memcpy(h_data,d,(r+2)*(c+2)*sizeof(float)); | ||
+ | devMemSet(); | ||
+ | } | ||
+ | |||
+ | Poisson::~Poisson(){ | ||
+ | delete [] h_data; | ||
+ | cudaFree(d_data); | ||
+ | cudaDeviceReset(); | ||
+ | } | ||
+ | |||
+ | bool Poisson::devMemSet(){ | ||
+ | |||
+ | /* create double buffer */ | ||
+ | cudaMalloc(&d_data, bufferSize * sizeof(float)); | ||
+ | |||
+ | if(d_data != nullptr){ | ||
+ | /* copy the initial information to the first buffer */ | ||
+ | cudaError_t state = cudaMemcpy((void*)d_data,(const void*)h_data, bufferSize * sizeof(float),cudaMemcpyHostToDevice); | ||
+ | if(state != cudaSuccess) | ||
+ | std::cerr << "ERROR on devMemSet at cudaMemcpy : " << cudaGetErrorString(state)<< std::endl; | ||
+ | } | ||
+ | return d_data; | ||
+ | } | ||
+ | |||
+ | float* Poisson::operator()(const size_t nIterations, const float wx, const float wy){ | ||
+ | |||
+ | /* calculate the grid, block, where block has 1024 threads total */ | ||
+ | unsigned int gridx = ((nRowsTotal+2)+blockx-1)/blockx; | ||
+ | unsigned int gridy = ((nRowsTotal+2)+blocky-1)/blocky; | ||
+ | |||
+ | /* create dim3 */ | ||
+ | dim3 dBlock= {blockx,blocky}; | ||
+ | dim3 dGrid = {gridx,gridy}; | ||
+ | |||
+ | /* generate shared memory map that will control ghost cell sharing */ | ||
+ | char* hmap = new char[(blockx+2)*(blocky+2)*3]; | ||
+ | int stride = 3; | ||
+ | for(int i = 0 ; i < (blockx+2);i++){ | ||
+ | for(int j = 0 ; j < (blocky+2);j++){ | ||
+ | char val = 0; | ||
+ | char x = 0; | ||
+ | char y = 0; | ||
+ | if(i==1){ | ||
+ | val = 1; | ||
+ | x=-1; | ||
+ | y=0; | ||
+ | } | ||
+ | if(j==1){ | ||
+ | val = 1; | ||
+ | x=0; | ||
+ | y=-1; | ||
+ | } | ||
+ | if(i==blockx) { | ||
+ | val = 1; | ||
+ | x=1; | ||
+ | y=0; | ||
+ | } | ||
+ | if(j==blocky){ | ||
+ | val = 1; | ||
+ | x=0; | ||
+ | y=1; | ||
+ | } | ||
+ | if(i==2 || j==2 || i==31 || j==31) | ||
+ | val = 2; | ||
+ | hmap[(i * (blockx+2) + j)*stride] = val; | ||
+ | hmap[(i * (blockx+2) + j)*stride+1] = x; | ||
+ | hmap[(i * (blockx+2) + j)*stride+2] = y; | ||
+ | } | ||
+ | } | ||
+ | /* transfer to device */ | ||
+ | char* dmap = nullptr; | ||
+ | cudaMalloc(&dmap,(blockx+2)*(blocky+2)*sizeof(char)*3); | ||
+ | cudaMemcpy(dmap,hmap,(blockx+2)*(blocky+2)*sizeof(char)*3,cudaMemcpyHostToDevice); | ||
+ | |||
+ | /* run iterations */ | ||
+ | update<<<dGrid,dBlock>>>(d_data,dmap,nColumns, nRowsTotal, wx, wy,nIterations); | ||
+ | |||
+ | /*DEBUG */ h_data[2*(nColumns+2)+2] = 100.0f; | ||
+ | /* output results from device to host */ | ||
+ | cudaError_t state = cudaMemcpy(h_data,d_data,(nColumns+2)*(nRowsTotal+2)*sizeof(float),cudaMemcpyDeviceToHost); | ||
+ | if(state != cudaSuccess) | ||
+ | std::cout << "ERROR on () when copying data back to host with : " << cudaGetErrorString(state)<< std::endl; | ||
+ | |||
+ | return h_data; | ||
+ | } | ||
+ | |||
+ | void Poisson::show(std::ostream& ofs) const{ | ||
+ | ofs << std::fixed << std::setprecision(1); | ||
+ | for (size_t j = 1; j <= nColumns ; j++) { | ||
+ | for (size_t i = 1 ; i <= nRowsTotal ; i++) | ||
+ | ofs << std::setw(8) << h_data[i * (nColumns+2) + j]<<","; | ||
+ | ofs << std::endl; | ||
+ | } | ||
+ | } | ||
+ | __global__ void update (float* data, char* dmap, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations){ | ||
+ | size_t j = blockDim.x * blockIdx.x + threadIdx.x + 1; /* for x axis */ | ||
+ | size_t i = blockDim.y * blockIdx.y + threadIdx.y + 1; /* for y axis */ | ||
+ | size_t y = threadIdx.x+1; | ||
+ | size_t x = threadIdx.y+1; | ||
+ | |||
+ | const unsigned int bufferSize = (32+2)*(32+2); | ||
+ | __shared__ float localBuffer[ 2 * bufferSize ]; /* double local buffer with ghost cells */ | ||
+ | // __shared__ char lmap[bufferSize]; | ||
+ | |||
+ | unsigned int buffer = 0; | ||
+ | |||
+ | float prefetch = 0.0f; | ||
+ | /* copy information into first of the local buffer */ | ||
+ | localBuffer[x*(32+2)+y] = data[i*(nCol+2)+j]; | ||
+ | __syncthreads(); | ||
+ | |||
+ | const char lmap = dmap[(x*(32+2)+y)*3]; | ||
+ | const char addx = dmap[(x*(32+2)+y)*3+1]; | ||
+ | const char addy = dmap[(x*(32+2)+y)*3+2]; | ||
+ | |||
+ | /* prefetch */ | ||
+ | if(lmap) | ||
+ | prefetch = data[(i+addx)*(nCol+2)+j+addy] ; | ||
+ | |||
+ | /* run iterations */ | ||
+ | for (unsigned int n = 0 ; n < nIterations; n++){ | ||
+ | if(lmap) | ||
+ | localBuffer[buffer * bufferSize + (x+addx)*(32+2) + y+addy] = prefetch; | ||
+ | /* Calculate and store into the other buffer */ | ||
+ | float curr = localBuffer[buffer*bufferSize + x * (32+2)+ y]; | ||
+ | float dir1 = localBuffer[buffer*bufferSize + (x+1) * (32+2) +y]; | ||
+ | float dir2 = localBuffer[buffer*bufferSize + (x-1) * (32+2) +y]; | ||
+ | float dir3 = localBuffer[buffer*bufferSize + x * (32+2) + y + 1]; | ||
+ | float dir4 = localBuffer[buffer*bufferSize + x * (32+2) + y - 1]; | ||
+ | localBuffer[(1-buffer)*bufferSize + x*(32+2)+y] = curr + wx*(dir1+dir2-2.0f*curr) + wy*(dir3+dir4-2.0f*curr); | ||
+ | /* flip buffer */ | ||
+ | buffer = 1-buffer; | ||
+ | /* for threads in charge of edges, share and obtain ghost cells */ | ||
+ | if(lmap){ | ||
+ | /* Copy over edges to global memory to be shared with neighboring blocks */ | ||
+ | data[i*(nCol+2)+j] = localBuffer[buffer * bufferSize + x * (32+2) + y ]; | ||
+ | } | ||
+ | __syncthreads(); | ||
+ | if(lmap){ | ||
+ | /* Copy back buffers from global memory */ | ||
+ | prefetch = data[(i+addx)*(nCol+2)+j+addy] ; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | /* copy the output back into global memory */ | ||
+ | data[i*(nCol+2)+j] = localBuffer[buffer * bufferSize + x * (32+2) + y ]; | ||
+ | __syncthreads(); | ||
+ | } | ||
+ | } | ||
+ | </source> | ||
+ | |} | ||
+ | {| class="wikitable mw-collapsible mw-collapsed" | ||
+ | ! For-loop Optimization - poissant-alt-ghost2.cuh | ||
+ | |- | ||
+ | | | ||
+ | <source> | ||
+ | /* | ||
+ | * Poisson Method using two arrays. | ||
+ | * Non-Ghost Cells Method | ||
+ | * Multiple PCIe Calls made, once per iteration | ||
+ | * by Tony Sim | ||
+ | */ | ||
+ | #ifndef POISSON_H | ||
+ | #define POISSON_H | ||
+ | #include <fstream> | ||
+ | #include <cuda_runtime.h> | ||
+ | |||
+ | namespace DPS{ | ||
+ | class Poisson { | ||
+ | unsigned int blockx; | ||
+ | unsigned int blocky; | ||
+ | unsigned int nRowsTotal; | ||
+ | unsigned int nColumns; | ||
+ | unsigned int bufferSize; | ||
+ | float* h_data; | ||
+ | float* d_data; | ||
+ | int bufferSide; | ||
+ | |||
+ | void bufferSwitch(){ bufferSide = 1 - bufferSide; }; | ||
+ | bool devMemSet(); | ||
+ | |||
+ | public: | ||
+ | Poisson() = delete; | ||
+ | Poisson(std::ifstream& ifs); | ||
+ | Poisson(const size_t r, const size_t c, float* d); | ||
+ | ~Poisson(); | ||
+ | float* operator()(const size_t iteration, const float wx, const float wy); | ||
+ | float* operator()(const size_t iteration){ | ||
+ | return operator()(iteration,0.1,0.1); | ||
+ | } | ||
+ | void show(std::ostream& ofs) const; | ||
+ | }; | ||
+ | __global__ void update (float* data, char* dmap, int nCol, int nRow, const float wx, const float wy, unsigned int nIterations); | ||
+ | } | ||
+ | #endif | ||
+ | |||
+ | </source> | ||
+ | |} | ||
+ | |||
+ | ==== Optimization Details ==== | ||
+ | ===== PCIe Version ===== | ||
+ | * Coalesced Memory - Large performance boost. | ||
+ | * Prefetch - this had minor to no effect on the performance. | ||
+ | |||
+ | ===== For-loop Version ===== | ||
+ | * Shared Memory - Small boost. Used technique called Ghost Cells where updated information is shared over global memory as needed to perform the next iteration. | ||
+ | * Prefetch - Small boost. Information are fetched first into register in the previous iteration to be copied in the current iteration prior to calculation. | ||
+ | * Coalesed Memory - Large boost. | ||
+ | * Logic change - To minimize the number of condition calls, a predefined map of instruction was created on the host based on the block dimension information. Using this information, the if statement had been cut down to almost 1/4, showing noticeable performance increase. | ||
+ | |||
+ | ==== Result ==== | ||
+ | [[File:optimized.png|frame|GPU highlights. Ghost Cell + Prefetch + Coaleased memory + logic change is slightly faster than simpler Prefetch+Coaleased memory that uses Global Memory]] | ||
+ | [[File:all.png|frame|Using GPU significantly improved Calculation Time over the CPU counterparts.]] |
Revision as of 02:03, 7 April 2019
GPU610/DPS915 | Student List | Group and Project Index | Student Resources | Glossary
Contents
[hide]Ghost Cells
Team Members
- Tony Sim, Issue Dumper
- Robert Dittrich, Issue Collector
- Inna Zhogova, Issue Resolver
Progress
Assignment 1
Tony
Subject: Jacobi's method for Poisson's equation
Source Code
[Expand] poissan.h |
---|
[Expand] poissan.cpp |
---|
[Expand] main.cpp |
---|
Introduction
The presented code simulates heat map using Jacobi's method for Poisson's equation. It is represented in a 2D array, and each element updates its value based on the adjacent elements at a given moment. Each iteration represent one instance in time. By repeating the calculation over the entire array through multiple iterations, we can estimate the state of the heat transfer after a given time interval.
Profiling
The profiling was conducted using a data set of 79 rows and 205 columns over 150000 iterations.
[Expand] Flat profile |
---|
[Expand] Call graph |
---|
Analysis
given 98.57 percent of time is spent on the update() function, it is considered the hotspot. Total time taken was 2.75.
If we consider a GPU environment with 1000 cores, we can estimate the following speedup: S1000 = 1/(1-.9857 + .9857/1000) = 65.00 In fact, the speed will decrease from 2.75 seconds to 0.0450 seconds.
As each iteration depends on the product of the previous iteration, there is a dependency resolution that might hamper the parallel process. Consideration may also be extended to resolving ghost cells across different SMX while using the device global memory as the transfer pipeline.
Robert
Multi Sampling Anti Aliasing
Source Files
[Expand] main.cpp |
---|
[Expand] vec3.h |
---|
Introduction
For my selection I chose to do Anti Aliasing since I see it a lot in video games but I never really knew how it worked. There are other anti aliasing methods like FXAA which is fast approximate anti aliasing but it seemed a lot more complicated than MSAA. The way I approached this problem is by getting the color of the pixels around a pixel. In you can specify the distance it will search in the application flags. In my implementation you specify an input file, output file, the radius of pixels to sample and how many passes to take on the image. In my tests the command line options I used was an image I made in paint with 4 sample size and 4 passes.
[Expand] Before |
---|
[Expand] After |
---|
Profiling
[Expand] Profiling |
---|
Conclusion
Since the msaa
function I wrote is a hotspot of the program I would suggest offloading part of it to a GPU, more specifically the part that finds the average of colors of the nearby pixels. That part also does not depend on previous iterations to finish so it is a prime candidate for parallelization.
Inna
Subject: Data compression - LWZ algorithm.
Source: http://www.cplusplus.com/articles/iL18T05o/#Version1
I tested the following source code for a compression and decompression of .txt files and a gif.
[Expand] lwz.cpp( ... ) |
---|
Tested data
1. book.txt - a 343 kilobyte text file.
2. words.txt - a 4.7 megabyte text file.
3. fire.gif - a 309 kilobyte graphical image.
Flat Profiles
Book
Flat profile for compression:
Flat profile for decompression:
Text
Flat profile for compression:
Flat profile for decompression:
GIF
Flat profile for compression:
Flat profile for decompression:
Assignment 2
Source Files
[Expand] poisson-pcie.cu |
---|
[Expand] poisson-alt.cu |
---|
Profiles
[Expand] Poisson PCIe Profile |
---|
[Expand] Poisson AltProfile |
---|
GPU Offload Vs CPU
Assignment 3
Source Codes
[Expand] PCIe Optimization |
---|
[Expand] For-loop Optimization |
---|
[Expand] For-loop Optimization - poissant-alt-ghost2.cuh |
---|
Optimization Details
PCIe Version
- Coalesced Memory - Large performance boost.
- Prefetch - this had minor to no effect on the performance.
For-loop Version
- Shared Memory - Small boost. Used technique called Ghost Cells where updated information is shared over global memory as needed to perform the next iteration.
- Prefetch - Small boost. Information are fetched first into register in the previous iteration to be copied in the current iteration prior to calculation.
- Coalesed Memory - Large boost.
- Logic change - To minimize the number of condition calls, a predefined map of instruction was created on the host based on the block dimension information. Using this information, the if statement had been cut down to almost 1/4, showing noticeable performance increase.