Open main menu

CDOT Wiki β

Changes

BETTERRED

5,384 bytes added, 12:57, 12 April 2017
Steps
=== Device to Host ===
After that is done the image is copied back using the streams in the same way it was copied over.
=== Code ===
 
{| class="wikitable mw-collapsible mw-collapsed"
! Unoptimized - BlurImage( ... )
|-
|
<syntaxhighlight lang="cpp">
const int ntpb = 1024;
const int STREAMS = 32;
 
void check(cudaError_t error) {
if (error != cudaSuccess) {
throw std::exception(cudaGetErrorString(error));
}
}
 
struct SImageData
{
SImageData()
: m_width(0)
, m_height(0)
{ }
 
long m_width;
long m_height;
long m_pitch;
std::vector<uint8_t> m_pixels;
};
 
float Gaussian(float sigma, float x)
{
return expf(-(x*x) / (2.0f * sigma*sigma));
}
 
float GaussianSimpsonIntegration(float sigma, float a, float b)
{
return
((b - a) / 6.0f) *
(Gaussian(sigma, a) + 4.0f * Gaussian(sigma, (a + b) / 2.0f) + Gaussian(sigma, b));
}
 
std::vector<float> GaussianKernelIntegrals(float sigma, int taps)
{
std::vector<float> ret;
float total = 0.0f;
for (int i = 0; i < taps; ++i)
{
float x = float(i) - float(taps / 2);
float value = GaussianSimpsonIntegration(sigma, x - 0.5f, x + 0.5f);
ret.push_back(value);
total += value;
}
// normalize it
for (unsigned int i = 0; i < ret.size(); ++i)
{
ret[i] /= total;
}
return ret;
}
 
struct BGRPixel {
float b;
float g;
float r;
};
 
__global__ void blur_kernel(BGRPixel* imageIn, BGRPixel* imageOut, float* blur, int n_blur, int x, int start, int jump) {
int idx = blockDim.x*blockIdx.x + threadIdx.x; // Location on the row
 
if (idx < x) {
int id = start + idx;
int bstart = id - (n_blur / 2)*jump;
 
BGRPixel pixel{ 0.0f, 0.0f, 0.0f };
 
for (int i = 0; i < n_blur; ++i) {
int bid = bstart + i*jump;
float iblur = blur[i];
 
pixel.b += imageIn[bid].b * iblur;
pixel.g += imageIn[bid].g * iblur;
pixel.r += imageIn[bid].r * iblur;
}
 
imageOut[id].b = pixel.b;
imageOut[id].g = pixel.g;
imageOut[id].r = pixel.r;
}
}
 
void BlurImage(const SImageData& srcImage, SImageData &destImage, float xblursigma, float yblursigma, unsigned int xblursize, unsigned int yblursize)
{
int xImage = srcImage.m_width; // Width of image
int yImage = srcImage.m_height; // Height of image
int imageSize = xImage*yImage;
 
int xPadded = xImage + (xblursize - 1); // Width including padding
int yPadded = yImage + (yblursize - 1); // Height including padding
int paddedSize = xPadded*yPadded;
 
int xPad = xblursize / 2; // Number of padding columns on each side
int yPad = yblursize / 2;
int padOffset = xPadded*yPad + xPad; // Offset to first pixel in padded image
 
float* pinnedImage = nullptr;
BGRPixel* d_padded1 = nullptr;
BGRPixel* d_padded2 = nullptr;
 
float* d_xblur = nullptr; // XBlur integrals
int n_xblur; // N
 
float* d_yblur = nullptr; // YBlur integrals
int n_yblur; // N
 
// Allocate memory for host and device
check(cudaHostAlloc((void**)&pinnedImage, 3 * imageSize * sizeof(float), 0));
check(cudaMalloc((void**)&d_padded1, paddedSize * sizeof(BGRPixel)));
check(cudaMalloc((void**)&d_padded2, paddedSize * sizeof(BGRPixel)));
 
// Copy image to pinned memory
for (int i = 0; i < 3 * imageSize; ++i) {
pinnedImage[i] = (float)srcImage.m_pixels[i];
}
 
// Allocate and assign intergrals
{
auto row_blur = GaussianKernelIntegrals(xblursigma, xblursize);
auto col_blur = GaussianKernelIntegrals(yblursigma, yblursize);
 
// ROW
n_xblur = row_blur.size();
check(cudaMalloc((void**)&d_xblur, n_xblur * sizeof(float)));
check(cudaMemcpy(d_xblur, row_blur.data(), n_xblur * sizeof(float), cudaMemcpyHostToDevice));
 
// COLUMN
n_yblur = col_blur.size();
check(cudaMalloc((void**)&d_yblur, n_yblur * sizeof(float)));
check(cudaMemcpy(d_yblur, col_blur.data(), n_yblur * sizeof(float), cudaMemcpyHostToDevice));
}
 
 
cudaStream_t stream[STREAMS];
 
int nblks = (xImage + (ntpb - 1)) / ntpb;
 
for (int i = 0; i < STREAMS; ++i) {
check(cudaStreamCreate(&stream[i]));
}
 
for (int i = 0; i < yImage;) {
for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {
cudaMemcpyAsync(d_padded1 + padOffset + i*xPadded, pinnedImage + (3 * i*xImage), 3 * xImage * sizeof(float), cudaMemcpyHostToDevice, stream[j]);
}
}
 
for (int i = 0; i < yImage;) {
for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {
blur_kernel << <nblks, ntpb, 0, stream[j] >> > (d_padded1, d_padded2, d_xblur, n_xblur, xImage, padOffset + i*xPadded, 1);
}
}
 
for (int i = 0; i < yImage;) {
for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {
blur_kernel << <nblks, ntpb, 0, stream[j] >> > (d_padded2, d_padded1, d_yblur, n_yblur, xImage, padOffset + i*xPadded, xPadded);
}
}
 
for (int i = 0; i < yImage;) {
for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {
check(cudaMemcpyAsync(pinnedImage + (3 * i*xImage), d_padded1 + padOffset + i*xPadded, xImage * sizeof(BGRPixel), cudaMemcpyDeviceToHost, stream[j]));
}
}
 
for (int i = 0; i < STREAMS; ++i) {
check(cudaStreamSynchronize(stream[i]));
check(cudaStreamDestroy(stream[i]));
}
 
destImage.m_width = srcImage.m_width;
destImage.m_height = srcImage.m_height;
destImage.m_pitch = srcImage.m_pitch;
destImage.m_pixels.resize(srcImage.m_pixels.size());
 
for (int i = 0; i < 3 * imageSize; i++) {
destImage.m_pixels[i] = (uint8_t)pinnedImage[i];
};
 
check(cudaFree(d_xblur));
check(cudaFree(d_yblur));
 
check(cudaFreeHost(pinnedImage));
check(cudaFree(d_padded1));
check(cudaFree(d_padded2));
 
check(cudaDeviceReset());
}
 
</syntaxhighlight>
 
|}
== Results ==
49
edits