Changes

BETTERRED

5,384 bytes added, 12:57, 12 April 2017

→‎Steps

=== Device to Host ===

After that is done the image is copied back using the streams in the same way it was copied over.

=== Code ===

{| class="wikitable mw-collapsible mw-collapsed"

! Unoptimized - BlurImage( ... )

|-

|

const int ntpb = 1024;

const int STREAMS = 32;

void check(cudaError_t error) {

if (error != cudaSuccess) {

throw std::exception(cudaGetErrorString(error));

}

struct SImageData

{

SImageData()

: m_width(0)

, m_height(0)

{ }

long m_width;

long m_height;

long m_pitch;

std::vector<uint8_t> m_pixels;

};

float Gaussian(float sigma, float x)

{

return expf(-(x*x) / (2.0f * sigma*sigma));

}

float GaussianSimpsonIntegration(float sigma, float a, float b)

{

return

((b - a) / 6.0f) *

(Gaussian(sigma, a) + 4.0f * Gaussian(sigma, (a + b) / 2.0f) + Gaussian(sigma, b));

}

std::vector<float> GaussianKernelIntegrals(float sigma, int taps)

{

std::vector<float> ret;

float total = 0.0f;

for (int i = 0; i < taps; ++i)

{

float x = float(i) - float(taps / 2);

float value = GaussianSimpsonIntegration(sigma, x - 0.5f, x + 0.5f);

ret.push_back(value);

total += value;

}

// normalize it

for (unsigned int i = 0; i < ret.size(); ++i)

{

ret[i] /= total;

}

return ret;

}

struct BGRPixel {

float b;

float g;

float r;

};

__global__ void blur_kernel(BGRPixel* imageIn, BGRPixel* imageOut, float* blur, int n_blur, int x, int start, int jump) {

int idx = blockDim.x*blockIdx.x + threadIdx.x; // Location on the row

if (idx < x) {

int id = start + idx;

int bstart = id - (n_blur / 2)*jump;

BGRPixel pixel{ 0.0f, 0.0f, 0.0f };

for (int i = 0; i < n_blur; ++i) {

int bid = bstart + i*jump;

float iblur = blur[i];

pixel.b += imageIn[bid].b * iblur;

pixel.g += imageIn[bid].g * iblur;

pixel.r += imageIn[bid].r * iblur;

}

imageOut[id].b = pixel.b;

imageOut[id].g = pixel.g;

imageOut[id].r = pixel.r;

}

void BlurImage(const SImageData& srcImage, SImageData &destImage, float xblursigma, float yblursigma, unsigned int xblursize, unsigned int yblursize)

{

int xImage = srcImage.m_width; // Width of image

int yImage = srcImage.m_height; // Height of image

int imageSize = xImage*yImage;

int xPadded = xImage + (xblursize - 1); // Width including padding

int yPadded = yImage + (yblursize - 1); // Height including padding

int paddedSize = xPadded*yPadded;

int xPad = xblursize / 2; // Number of padding columns on each side

int yPad = yblursize / 2;

int padOffset = xPadded*yPad + xPad; // Offset to first pixel in padded image

float* pinnedImage = nullptr;

BGRPixel* d_padded1 = nullptr;

BGRPixel* d_padded2 = nullptr;

float* d_xblur = nullptr; // XBlur integrals

int n_xblur; // N

float* d_yblur = nullptr; // YBlur integrals

int n_yblur; // N

// Allocate memory for host and device

check(cudaHostAlloc((void**)&pinnedImage, 3 * imageSize * sizeof(float), 0));

check(cudaMalloc((void**)&d_padded1, paddedSize * sizeof(BGRPixel)));

check(cudaMalloc((void**)&d_padded2, paddedSize * sizeof(BGRPixel)));

// Copy image to pinned memory

for (int i = 0; i < 3 * imageSize; ++i) {

pinnedImage[i] = (float)srcImage.m_pixels[i];

}

// Allocate and assign intergrals

{

auto row_blur = GaussianKernelIntegrals(xblursigma, xblursize);

auto col_blur = GaussianKernelIntegrals(yblursigma, yblursize);

// ROW

n_xblur = row_blur.size();

check(cudaMalloc((void**)&d_xblur, n_xblur * sizeof(float)));

check(cudaMemcpy(d_xblur, row_blur.data(), n_xblur * sizeof(float), cudaMemcpyHostToDevice));

// COLUMN

n_yblur = col_blur.size();

check(cudaMalloc((void**)&d_yblur, n_yblur * sizeof(float)));

check(cudaMemcpy(d_yblur, col_blur.data(), n_yblur * sizeof(float), cudaMemcpyHostToDevice));

}

cudaStream_t stream[STREAMS];

int nblks = (xImage + (ntpb - 1)) / ntpb;

for (int i = 0; i < STREAMS; ++i) {

check(cudaStreamCreate(&stream[i]));

}

for (int i = 0; i < yImage;) {

for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {

cudaMemcpyAsync(d_padded1 + padOffset + i*xPadded, pinnedImage + (3 * i*xImage), 3 * xImage * sizeof(float), cudaMemcpyHostToDevice, stream[j]);

}

for (int i = 0; i < yImage;) {

for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {

blur_kernel << <nblks, ntpb, 0, stream[j] >> > (d_padded1, d_padded2, d_xblur, n_xblur, xImage, padOffset + i*xPadded, 1);

}

for (int i = 0; i < yImage;) {

for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {

blur_kernel << <nblks, ntpb, 0, stream[j] >> > (d_padded2, d_padded1, d_yblur, n_yblur, xImage, padOffset + i*xPadded, xPadded);

}

for (int i = 0; i < yImage;) {

for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) {

check(cudaMemcpyAsync(pinnedImage + (3 * i*xImage), d_padded1 + padOffset + i*xPadded, xImage * sizeof(BGRPixel), cudaMemcpyDeviceToHost, stream[j]));

}

for (int i = 0; i < STREAMS; ++i) {

check(cudaStreamSynchronize(stream[i]));

check(cudaStreamDestroy(stream[i]));

}

destImage.m_width = srcImage.m_width;

destImage.m_height = srcImage.m_height;

destImage.m_pitch = srcImage.m_pitch;

destImage.m_pixels.resize(srcImage.m_pixels.size());

for (int i = 0; i < 3 * imageSize; i++) {

destImage.m_pixels[i] = (uint8_t)pinnedImage[i];

};

check(cudaFree(d_xblur));

check(cudaFree(d_yblur));

check(cudaFreeHost(pinnedImage));

check(cudaFree(d_padded1));

check(cudaFree(d_padded2));

check(cudaDeviceReset());

}

</syntaxhighlight>

|}

== Results ==

Jkraitberg

49

edits

CDOT Wiki β

Changes

BETTERRED

CDOT Wiki ^β