Changes

Jump to: navigation, search

BETTERRED

1,381 bytes removed, 23:31, 11 April 2017
Assignment 2 - Parallelize
{| class="wikitable mw-collapsible mw-collapsed"
! Culptit Unoptimized - BlurImage( ... )
|-
|
#include <windows.h> // for bitmap headers.
#include <algorithm>
#include <chrono>
#include <cuda_runtime.h>
#include <device_functions.h>
//#if defined(__NVCC__) && __CUDACC_VER_MAJOR__ != 1const int ntpb = 1024;ifdef __CUDACC__//#elif defined(__NVCC__) &&if __CUDACC_VER_MAJOR__ == 1
//const int ntpb = 512;
//#else
//const int ntpb = 1024;
//#endif
//#endifconst float c_pi int ntpb = 3.14159265359f1024;const int STREAMS = 32;
void check(cudaError_t error) {
}
const uint8_t* GetPixelOrBlack(const SImageData& image, int x, int y)struct BGRPixel { static const uint8_t black[3] = { 0, 0, 0 }float b; if (x < 0 || x >= image.m_width || y < 0 || y >= image.m_height)float g; { return blackfloat r; return &image.m_pixels[(y * image.m_pitch) + x * 3];}
__global__ void horizontal_blur_kernelblur_kernel(floatBGRPixel* pixelsimageIn, floatBGRPixel* outputimageOut, float* intergralsblur, int nIntegralsn_blur, int widthx, int heightstart, int pitchjump) { //int p idx = pitch; //int x = width; //int y = height; //int n = nIntegrals; int idy = blockIdxblockDim.x*blockDimblockIdx.x + threadIdx.x; int idx = blockIdx.y*blockDim.y + threadIdx.y; //int startOffset = -1 * int(nIntegrals / 2);Location on the row
//float* dst;if (idx < x) { //const float* pixel int id = start + idx; //const float black[3] int bstart = { 0.0f, 0.0f, 0.0f }; id - (n_blur //float blurred_pixel[3] = { 0.0f, 0.0f, 0.0f }2)*jump;
//for (int i = BGRPixel pixel{ 0; i < n; ++i) { // Prefetch for integrals and pixels // int ty = y + startOffset + i; // pixel = (idx < .0f, 0 || idx >= x || // idy < .0f, 0 || idy >= ty) ? black : &pixels[(ty * p) + idx * 3].0f };
// blurred_pixel[0] + for (int i = pixel[0] * intergrals[; i]< n_blur;++i) { // blurred_pixel[1] int bid = bstart += pixel[1] i* intergrals[i]jump; // blurred_pixel[2] + float iblur = pixel[2] * intergralsblur[i]; //}
//dst pixel.b += &outputimageIn[idybid].b *p + idx * 3]iblur; //dst[0] pixel.g += blurred_pixelimageIn[0bid].g * iblur; //dst[1] pixel.r += blurred_pixelimageIn[1bid].r * iblur; //dst[2] = blurred_pixel[2]; }
//if (idx == 0) { outputimageOut[idx*width + idyid] .b = pixels[idx*width + idy]pixel.b//if (idx % 3 imageOut[id].g == 0) {pixel.g; // outputimageOut[idx + idy*widthid] .r = 0pixel.r; //} //}
}
void BlurImage(const SImageData& srcImage, SImageData &destImage, float xblursigma, float yblursigma, unsigned int xblursize, unsigned int yblursize)
{
float* d_ipixelsint xImage = srcImage.m_width; // Device input pixel array Width of image float* d_opixelsint yImage = srcImage.m_height; // Device output pixel arrayHeight of image floatint imageSize = xImage* d_integralsyImage; // Stores guassian kernel intergrals
int n xPadded = srcImage.m_height*srcImage.m_pitchxImage + (xblursize - 1); // Width including padding int nblks yPadded = yImage + (n + ntpb yblursize - 1) ; // ntpbHeight including padding int paddedSize = xPadded*yPadded;
dim3 dimBlock(1, 3)int xPad = xblursize / 2; // Number of padding columns on each side dim3 dimGrid(srcImage.m_widthint yPad = yblursize / 2; int padOffset = xPadded*3, srcImage.m_height)yPad + xPad; // Offset to first pixel in padded image
check(cudaMalloc((voidfloat*pinnedImage = nullptr; BGRPixel*)&d_ipixels, srcImage.m_pitch*srcImage.m_height * sizeof(float)))d_padded1 = nullptr; check(cudaMalloc((voidBGRPixel**)&d_opixels, srcImage.m_pitch*srcImage.m_height*sizeof(float)))d_padded2 = nullptr;
{ std::vector<float> temp(srcImage.m_pixels.size())* d_xblur = nullptr; std::transform(srcImage.m_pixels.begin(), srcImage.m_pixels.end(), temp.begin(), [](auto e) { return e // 255.0f; XBlur integrals }) int n_xblur; // N
check(cudaMemcpy(d_ipixels, temp.data(), 3 * srcImage.m_width float*srcImage.m_height * sizeof(float), cudaMemcpyHostToDevice))d_yblur = nullptr; // YBlur integrals }int n_yblur; // N
// horizontal blur from d_ipixels to d_opixelsAllocate memory for host and device { check(cudaHostAlloc((void**)&pinnedImage, 3 * imageSize * sizeof(float), 0)); int nIntegrals check(cudaMalloc((void**)&d_padded1, paddedSize * sizeof(BGRPixel))); check(cudaMalloc((void**)&d_padded2, paddedSize * sizeof(BGRPixel)));
// Scoped so that the row is cleared once it's copiedCopy image to pinned memory for (int i = 0; i < 3 * imageSize; ++i) { auto row pinnedImage[i] = GaussianKernelIntegrals(xblursigma, xblursizefloat)srcImage.m_pixels[i]; nIntegrals = row.size(); }
check(cudaMalloc // Allocate and assign intergrals { auto row_blur = GaussianKernelIntegrals((void**)&d_integralsxblursigma, row.size() * sizeof(float))xblursize); check(cudaMemcpy(d_integrals, row.data(), row.size() * sizeof auto col_blur = GaussianKernelIntegrals(float)yblursigma, cudaMemcpyHostToDevice)yblursize); }
horizontal_blur_kernel <<<dimGrid// ROW n_xblur = row_blur.size(); check(cudaMalloc((void**)&d_xblur, dimBlock >>> n_xblur * sizeof(float))); check(cudaMemcpy(d_ipixels, d_opixels, d_integrals, nIntegralsd_xblur, srcImagerow_blur.m_widthdata(), srcImage.m_heightn_xblur * sizeof(float), srcImage.m_pitchcudaMemcpyHostToDevice));
cudaDeviceSynchronize// COLUMN n_yblur = col_blur.size(); check(cudaGetLastErrorcudaMalloc((void**)&d_yblur, n_yblur * sizeof(float)));  check(cudaFreecudaMemcpy(d_integralsd_yblur, col_blur.data(), n_yblur * sizeof(float), cudaMemcpyHostToDevice));
}
destImage.m_width = srcImage.m_width;
destImage.m_height = srcImage.m_height;
destImage.m_pitch = srcImage.m_pitch;
destImage.m_pixels.resize(destImage.m_height * destImage.m_pitch);
{ std::vector<float> temp(srcImage.m_pixels.size()); check(cudaMemcpy(temp.data(), d_opixels, 3 * srcImage.m_width*srcImage.m_height * sizeof(float), cudaMemcpyDeviceToHost)); std::transform(temp.begin(), temp.end(), destImage.m_pixels.begin(), cudaStream_t stream[STREAMS](auto e) { return (int)(e * 255.0f); }); }
checkint nblks = (cudaFree(d_ipixels)); check(cudaFree(d_opixels)); check(cudaDeviceResetxImage + (ntpb - 1))/ ntpb;
for (int i = 0; i < STREAMS; ++i) {
check(cudaStreamCreate(&stream[i]));
}
//// allocate space for copying the image for destImage and tmpImage //destImage.m_width (int i = srcImage.m_width0;i < yImage;) { //destImage.m_height for (int j = srcImage.m_height0; //destImage.m_pitch = srcImage.m_pitchj < STREAMS && i < yImage;++j, ++i) { //destImage.m_pixels.resize cudaMemcpyAsync(d_padded1 + padOffset + i*xPadded, pinnedImage + (destImage.m_height 3 * i*xImage), 3 * xImage * destImage.m_pitchsizeof(float), cudaMemcpyHostToDevice, stream[j]); } }
//SImageData tmpImagefor (int i = 0; //tmpImage.m_width = srcImage.m_widthi < yImage;) { //tmpImage.m_height for (int j = srcImage.m_height0; //tmpImage.m_pitch = srcImage.m_pitchj < STREAMS && i < yImage;++j, ++i) { //tmpImage.m_pixels.resize blur_kernel << <nblks, ntpb, 0, stream[j] >> > (tmpImage.m_height d_padded1, d_padded2, d_xblur, n_xblur, xImage, padOffset + i* tmpImage.m_pitchxPadded, 1); } }
//// horizontal blur from srcImage into tmpImagefor (int i = 0; i < yImage;) { // for (int j = 0; j < STREAMS && i < yImage; ++j, ++i) { // auto row = GaussianKernelIntegrals blur_kernel << <nblks, ntpb, 0, stream[j] >> > (xblursigmad_padded2, xblursized_padded1, d_yblur, n_yblur, xImage, padOffset + i*xPadded, xPadded); } }
// for (int startOffset i = -1 * 0; i < yImage;) { for (intj = 0; j < STREAMS && i < yImage; ++j, ++i) { check(row.sizecudaMemcpyAsync(pinnedImage + (3 * i*xImage), d_padded1 + padOffset + i*xPadded, xImage * sizeof(BGRPixel), cudaMemcpyDeviceToHost, stream[j]) / 2); } }
// for (int y = 0; y < tmpImage.m_height; ++y) // { // for (int x = 0; x < tmpImage.m_width; ++x) // { // std::array<float, 3> blurredPixel = { { 0.0f, 0.0f, 0.0f } }; // for (unsigned int i = 0; i < row.size()STREAMS; ++i){ // { // const uint8_t *pixel = GetPixelOrBlackcheck(srcImage, x + startOffset + i, y); // blurredPixel[0] += floatcudaStreamSynchronize(pixelstream[0i]) * row[i]); // blurredPixel[1] += float check(cudaStreamDestroy(pixelstream[1i]) * row[i]; // blurredPixel[2] += float(pixel[2]) * row[i]; // }
// uint8_t *destPixel destImage.m_width = srcImage.m_width; destImage.m_height = srcImage.m_height; destImage.m_pitch = &tmpImagesrcImage.m_pitch; destImage.m_pixels[y * tmpImage.m_pitch + x * 3]resize(srcImage.m_pixels.size());
// destPixel[0] for (int i = uint8_t(blurredPixel[0]; i < 3 * imageSize; i++);{ // destPixeldestImage.m_pixels[1i] = (uint8_t(blurredPixel[1]); // destPixelpinnedImage[2i] = uint8_t(blurredPixel[2]); // } // } //};
//// vertical blur from tmpImage into destImagecheck(cudaFree(d_xblur)); //{ // auto row = GaussianKernelIntegralscheck(cudaFree(yblursigma, yblursized_yblur));
// int startOffset = -1 * intcheck(row.sizecudaFreeHost(pinnedImage)); check(cudaFree(d_padded1)); check(cudaFree(d_padded2) / 2);
// for check(int y = 0; y < destImage.m_height; ++y) // { // for (int x = 0; x < destImage.m_width; ++x) // { // std::array<float, 3> blurredPixel = { { 0.0f, 0.0f, 0.0f } }; // for (unsigned int i = 0; i < row.size(); ++i) // { // const uint8_t *pixel = GetPixelOrBlack(tmpImage, x, y + startOffset + i); // blurredPixel[0] += float(pixel[0]) * row[i]; // blurredPixel[1] += float(pixel[1]) * row[i]; // blurredPixel[2] += float(pixel[2]) * row[i]; // }  // uint8_t *destPixel = &destImage.m_pixels[y * destImage.m_pitch + x * 3];  // destPixel[0] = uint8_t(blurredPixel[0]); // destPixel[1] = uint8_tcudaDeviceReset(blurredPixel[1]); // destPixel[2] = uint8_t(blurredPixel[2]); // } // } //}
}
printf("%s loaded\n", srcFileName);
SImageData destImage;
 
auto t1 = std::chrono::high_resolution_clock::now();
BlurImage(srcImage, destImage, xblursigma, yblursigma, xblursize, yblursize);
auto t2 = std::chrono::high_resolution_clock::now();
 
std::cout << "BlurImage time: " << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << "us" << std::endl;
 
 
if (SaveImage(destFileName, destImage))
printf("Blurred image saved as %s\n", destFileName);
49
edits

Navigation menu