Changes

BETTERRED

1,381 bytes removed, 23:31, 11 April 2017

→‎Assignment 2 - Parallelize

{| class="wikitable mw-collapsible mw-collapsed"

! ~~Culptit~~ Unoptimized - BlurImage( ... )

|-

|

#include <windows.h> // for bitmap headers.

#include <algorithm>

#include <chrono>

#include <cuda_runtime.h>

#include <device_functions.h>

//#~~if defined(__NVCC__) && __CUDACC_VER_MAJOR__ != 1const int ntpb = 1024;~~ifdef __CUDACC__//#~~elif defined(__NVCC__) &&~~if __CUDACC_VER_MAJOR__ == 1

//const int ntpb = 512;

//#else

//const int ntpb = 1024;

//#endif

//#endifconst ~~float c_pi~~ int ntpb = ~~3.14159265359f~~1024;const int STREAMS = 32;

void check(cudaError_t error) {

}

~~const uint8_t* GetPixelOrBlack(const SImageData& image, int x, int y)~~struct BGRPixel { ~~static const uint8_t black[3] = { 0, 0, 0 }~~float b; ~~if (x < 0 || x >= image.m_width ||~~ ~~y < 0 || y >= image.m_height)~~float g; { ~~return black~~float r; } ~~return &image.m_pixels[(y * image.m_pitch) + x * 3]~~;}

__global__ void ~~horizontal_blur_kernel~~blur_kernel(~~float~~BGRPixel* ~~pixels~~imageIn, ~~float~~BGRPixel* ~~output~~imageOut, float* ~~intergrals~~blur, int ~~nIntegrals~~n_blur, int ~~width~~x, int ~~height~~start, int ~~pitch~~jump) { //int p idx = ~~pitch;~~ ~~//int x = width;~~ ~~//int y = height;~~ ~~//int n = nIntegrals;~~ ~~int idy = blockIdx~~blockDim.x*~~blockDim~~blockIdx.x + threadIdx.x; ~~int idx = blockIdx.y*blockDim.y + threadIdx.y;~~ //~~int startOffset = -1 * int(nIntegrals / 2);~~Location on the row

~~//float* dst;~~if (idx < x) { ~~//const float* pixel~~ int id = start + idx; ~~//const float black[3]~~ int bstart = ~~{ 0.0f, 0.0f, 0.0f };~~ id - (n_blur /~~/float blurred_pixel[3] = { 0.0f, 0.0f, 0.0f }~~2)*jump;

~~//for (int i =~~ BGRPixel pixel{ 0~~; i < n; ++i) { // Prefetch for integrals and pixels~~ ~~// int ty = y + startOffset + i;~~ ~~// pixel = (idx <~~ .0f, 0 ~~|| idx >= x ||~~ ~~// idy <~~ .0f, 0 ~~|| idy >= ty) ? black : &pixels[(ty * p) + idx * 3]~~.0f };

~~// blurred_pixel[0] +~~ for (int i = ~~pixel[~~0~~] * intergrals[~~; i]< n_blur;++i) { ~~// blurred_pixel[1]~~ int bid = bstart +~~= pixel[1]~~ i* ~~intergrals[i]~~jump; ~~// blurred_pixel[2] +~~ float iblur = ~~pixel[2] * intergrals~~blur[i]; ~~//}~~

~~//dst~~ pixel.b += ~~&output~~imageIn[~~idy~~bid].b *~~p + idx * 3]~~iblur; ~~//dst[0]~~ pixel.g += ~~blurred_pixel~~imageIn[0bid].g * iblur; ~~//dst[1]~~ pixel.r += ~~blurred_pixel~~imageIn[1bid].r * iblur; ~~//dst[2] = blurred_pixel[2];~~ }

~~//if (idx == 0) {~~ ~~output~~imageOut[~~idx*width + idy~~id] .b = ~~pixels[idx*width + idy]~~pixel.b; ~~//if (idx % 3~~ imageOut[id].g =~~= 0) {~~pixel.g; ~~// output~~imageOut[~~idx + idy*width~~id] .r = 0pixel.r; ~~//}~~ //}

}

void BlurImage(const SImageData& srcImage, SImageData &destImage, float xblursigma, float yblursigma, unsigned int xblursize, unsigned int yblursize)

{

~~float* d_ipixels~~int xImage = srcImage.m_width; // ~~Device input pixel array~~ Width of image ~~float* d_opixels~~int yImage = srcImage.m_height; // ~~Device output pixel array~~Height of image ~~float~~int imageSize = xImage* ~~d_integrals~~yImage; ~~// Stores guassian kernel intergrals~~

int n xPadded = ~~srcImage.m_height*srcImage.m_pitch~~xImage + (xblursize - 1); // Width including padding int ~~nblks~~ yPadded = yImage + (~~n + ntpb~~ yblursize - 1) ; // ~~ntpb~~Height including padding int paddedSize = xPadded*yPadded;

~~dim3 dimBlock(1, 3)~~int xPad = xblursize / 2; // Number of padding columns on each side ~~dim3 dimGrid(srcImage.m_width~~int yPad = yblursize / 2; int padOffset = xPadded*~~3, srcImage.m_height)~~yPad + xPad; // Offset to first pixel in padded image

~~check(cudaMalloc((void~~float*pinnedImage = nullptr; BGRPixel*~~)&d_ipixels, srcImage.m_pitch*srcImage.m_height * sizeof(float)))~~d_padded1 = nullptr; ~~check(cudaMalloc((void~~BGRPixel**)&d_opixels, srcImage.m_pitch*srcImage.m_height*sizeof(float)))d_padded2 = nullptr;

{ ~~std::vector<~~float~~> temp(srcImage.m_pixels.size())~~* d_xblur = nullptr; ~~std::transform(srcImage.m_pixels.begin(), srcImage.m_pixels.end(), temp.begin(), [](auto e) {~~ ~~return e~~ // ~~255.0f;~~ XBlur integrals }) int n_xblur; // N

~~check(cudaMemcpy(d_ipixels, temp.data(), 3 * srcImage.m_width~~ float*~~srcImage.m_height * sizeof(float), cudaMemcpyHostToDevice))~~d_yblur = nullptr; // YBlur integrals }int n_yblur; // N

// ~~horizontal blur from d_ipixels to d_opixels~~Allocate memory for host and device { check(cudaHostAlloc((void**)&pinnedImage, 3 * imageSize * sizeof(float), 0)); ~~int nIntegrals~~ check(cudaMalloc((void**)&d_padded1, paddedSize * sizeof(BGRPixel))); check(cudaMalloc((void**)&d_padded2, paddedSize * sizeof(BGRPixel)));

// ~~Scoped so that the row is cleared once it's copied~~Copy image to pinned memory for (int i = 0; i < 3 * imageSize; ++i) { ~~auto row~~ pinnedImage[i] = ~~GaussianKernelIntegrals~~(~~xblursigma, xblursize~~float)srcImage.m_pixels[i]; ~~nIntegrals = row.size();~~ }

~~check(cudaMalloc~~ // Allocate and assign intergrals { auto row_blur = GaussianKernelIntegrals(~~(void**)&d_integrals~~xblursigma, ~~row.size() * sizeof(float))~~xblursize); ~~check(cudaMemcpy(d_integrals, row.data(), row.size() * sizeof~~ auto col_blur = GaussianKernelIntegrals(~~float)~~yblursigma, ~~cudaMemcpyHostToDevice)~~yblursize); }

~~horizontal_blur_kernel <<<dimGrid~~// ROW n_xblur = row_blur.size(); check(cudaMalloc((void**)&d_xblur, ~~dimBlock >>>~~ n_xblur * sizeof(float))); check(cudaMemcpy(~~d_ipixels, d_opixels, d_integrals, nIntegrals~~d_xblur, ~~srcImage~~row_blur.~~m_width~~data(), ~~srcImage.m_height~~n_xblur * sizeof(float), ~~srcImage.m_pitch~~cudaMemcpyHostToDevice));

~~cudaDeviceSynchronize~~// COLUMN n_yblur = col_blur.size(); check(~~cudaGetLastError~~cudaMalloc((void**)&d_yblur, n_yblur * sizeof(float))); check(~~cudaFree~~cudaMemcpy(~~d_integrals~~d_yblur, col_blur.data(), n_yblur * sizeof(float), cudaMemcpyHostToDevice));

}

~~destImage.m_width = srcImage.m_width;~~

~~destImage.m_height = srcImage.m_height;~~

~~destImage.m_pitch = srcImage.m_pitch;~~

~~destImage.m_pixels.resize(destImage.m_height * destImage.m_pitch);~~

{ ~~std::vector<float> temp(srcImage.m_pixels.size());~~ ~~check(cudaMemcpy(temp.data(), d_opixels, 3 * srcImage.m_width*srcImage.m_height * sizeof(float), cudaMemcpyDeviceToHost));~~ ~~std::transform(temp.begin(), temp.end(), destImage.m_pixels.begin(),~~ cudaStream_t stream[STREAMS]~~(auto e) {~~ ~~return (int)(e * 255.0f);~~ }); }

~~check~~int nblks = (~~cudaFree(d_ipixels));~~ ~~check(cudaFree(d_opixels));~~ ~~check(cudaDeviceReset~~xImage + (ntpb - 1))/ ntpb;

for (int i = 0; i < STREAMS; ++i) {

check(cudaStreamCreate(&stream[i]));

}

~~//// allocate space~~ for ~~copying the image for destImage and tmpImage~~ ~~//destImage.m_width~~ (int i = ~~srcImage.m_width~~0;i < yImage;) { ~~//destImage.m_height~~ for (int j = ~~srcImage.m_height~~0; ~~//destImage.m_pitch = srcImage.m_pitch~~j < STREAMS && i < yImage;++j, ++i) { ~~//destImage.m_pixels.resize~~ cudaMemcpyAsync(d_padded1 + padOffset + i*xPadded, pinnedImage + (~~destImage.m_height~~ 3 * i*xImage), 3 * xImage * ~~destImage.m_pitch~~sizeof(float), cudaMemcpyHostToDevice, stream[j]); } }

~~//SImageData tmpImage~~for (int i = 0; ~~//tmpImage.m_width = srcImage.m_width~~i < yImage;) { ~~//tmpImage.m_height~~ for (int j = ~~srcImage.m_height~~0; ~~//tmpImage.m_pitch = srcImage.m_pitch~~j < STREAMS && i < yImage;++j, ++i) { ~~//tmpImage.m_pixels.resize~~ blur_kernel << <nblks, ntpb, 0, stream[j] >> > (~~tmpImage.m_height~~ d_padded1, d_padded2, d_xblur, n_xblur, xImage, padOffset + i* ~~tmpImage.m_pitch~~xPadded, 1); } }

~~//// horizontal blur from srcImage into tmpImage~~for (int i = 0; i < yImage;) { // for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) { ~~// auto row = GaussianKernelIntegrals~~ blur_kernel << <nblks, ntpb, 0, stream[j] >> > (~~xblursigma~~d_padded2, ~~xblursize~~d_padded1, d_yblur, n_yblur, xImage, padOffset + i*xPadded, xPadded); } }

// for (int ~~startOffset~~ i = -1 * 0; i < yImage;) { for (intj = 0; j < STREAMS && i < yImage; ++j, ++i) { check(~~row.size~~cudaMemcpyAsync(pinnedImage + (3 * i*xImage), d_padded1 + padOffset + i*xPadded, xImage * sizeof(BGRPixel), cudaMemcpyDeviceToHost, stream[j]) ~~/ 2~~); } }

// for (~~int y = 0; y < tmpImage.m_height; ++y)~~ ~~// {~~ ~~// for (int x = 0; x < tmpImage.m_width; ++x)~~ ~~// {~~ ~~// std::array<float, 3> blurredPixel = { { 0.0f, 0.0f, 0.0f } };~~ ~~// for (unsigned~~ int i = 0; i < ~~row.size()~~STREAMS; ++i){ // { ~~// const uint8_t *pixel = GetPixelOrBlack~~check(~~srcImage, x + startOffset + i, y);~~ ~~// blurredPixel[0] += float~~cudaStreamSynchronize(~~pixel~~stream[0i]) * row[i]); ~~// blurredPixel[1] += float~~ check(cudaStreamDestroy(~~pixel~~stream[1i]) * row[i]; ~~// blurredPixel[2] += float(pixel[2]~~) * row[i]; // }

~~// uint8_t *destPixel~~ destImage.m_width = srcImage.m_width; destImage.m_height = srcImage.m_height; destImage.m_pitch = ~~&tmpImage~~srcImage.m_pitch; destImage.m_pixels~~[y * tmpImage~~.~~m_pitch + x * 3]~~resize(srcImage.m_pixels.size());

~~// destPixel[0]~~ for (int i = ~~uint8_t(blurredPixel[~~0]; i < 3 * imageSize; i++);{ // ~~destPixel~~destImage.m_pixels[1i] = (uint8_t~~(blurredPixel[1]~~); ~~// destPixel~~pinnedImage[2i] ~~= uint8_t(blurredPixel[2])~~; // } ~~// }~~ ~~//}~~;

~~//// vertical blur from tmpImage into destImage~~check(cudaFree(d_xblur)); ~~//{~~ ~~// auto row = GaussianKernelIntegrals~~check(cudaFree(~~yblursigma, yblursize~~d_yblur));

~~// int startOffset = -1 * int~~check(~~row.size~~cudaFreeHost(pinnedImage)); check(cudaFree(d_padded1)); check(cudaFree(d_padded2) ~~/ 2~~);

~~// for~~ check(~~int y = 0; y < destImage.m_height; ++y)~~ ~~// {~~ ~~// for (int x = 0; x < destImage.m_width; ++x)~~ ~~// {~~ ~~// std::array<float, 3> blurredPixel = { { 0.0f, 0.0f, 0.0f } };~~ ~~// for (unsigned int i = 0; i < row.size(); ++i)~~ ~~// {~~ ~~// const uint8_t *pixel = GetPixelOrBlack(tmpImage, x, y + startOffset + i);~~ ~~// blurredPixel[0] += float(pixel[0]) * row[i];~~ ~~// blurredPixel[1] += float(pixel[1]) * row[i];~~ ~~// blurredPixel[2] += float(pixel[2]) * row[i];~~ ~~// }~~ ~~// uint8_t *destPixel = &destImage.m_pixels[y * destImage.m_pitch + x * 3];~~ ~~// destPixel[0] = uint8_t(blurredPixel[0]);~~ ~~// destPixel[1] = uint8_t~~cudaDeviceReset(~~blurredPixel[1]~~); ~~// destPixel[2] = uint8_t(blurredPixel[2]~~); ~~// }~~ ~~// }~~ ~~//}~~

}

printf("%s loaded\n", srcFileName);

SImageData destImage;

auto t1 = std::chrono::high_resolution_clock::now();

BlurImage(srcImage, destImage, xblursigma, yblursigma, xblursize, yblursize);

auto t2 = std::chrono::high_resolution_clock::now();

std::cout << "BlurImage time: " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us" << std::endl;

if (SaveImage(destFileName, destImage))

printf("Blurred image saved as %s\n", destFileName);

Jkraitberg

49

edits

Changes

BETTERRED

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools