Changes

← Older edit

PIL Cuda

3,069 bytes added, 15:24, 10 December 2014

→‎Pillow and Guassian Blur in CUDA

This is rather simplified, as the `Imaging` struct contains many more properties and we're taking advantage of the fact that the library separates an incoming image into separate `Imaging` structs for each colour channel.

=== Gaussian Blur in CUDA===

While all the code is available on github https://github.com/GabrielCastro/Pillow this is a simplified psudo code version of the blur process

__global__ static void blurRows(const px8_t* __restrict__ in, float* __restrict__ buff,

const size_t xSize, const size_t ySize,

const float* __restrict__ mask, const size_t radius) {

const size_t y = blockIdx.y * blockDim.y + threadIdx.y;

const size_t x = blockIdx.x * blockDim.x + threadIdx.x;

if (y >= ySize || x >= xSize) {

return;

}

const size_t buffIdx = y * xSize + x;

float sum = 0;

for (size_t p = 0; p < radius; ++p) {

float maskVal = mask[p];

int offset = (int)(-((float)radius / 2.0) + (float)p + 0.5);

int xOff = x + offset;

if (xOff < 0) {

offset = -x;

} else if (xOff >= xSize) {

offset = xSize - x - 1;

}

size_t pxIndex = buffIdx + offset;

sum += in[pxIndex] * maskVal;

}

buff[buffIdx] = sum;

}

__global__ static void blurCols(const float* __restrict__ buff, px8_t* __restrict__ out,

const size_t xSize, const size_t ySize,

const float* __restrict__ mask, const size_t radius) {

const size_t y = blockIdx.y * blockDim.y + threadIdx.y;

const size_t x = blockIdx.x * blockDim.x + threadIdx.x;

if (y >= ySize || x >= xSize) {

return;

}

size_t outIdx = y * xSize + x;

float sum = 0;

for (size_t p = 0; p < radius; ++p) {

float maskVal = mask[p];

int offset = (int)(-((float)radius / 2.0) + (float)p + 0.5);

int lOff = y + offset;

if (lOff < 0) {

offset = -y;

} else if (lOff >= ySize) {

offset = ySize - y - 1;

}

size_t buffIdx = outIdx + offset * xSize;

sum += buff[buffIdx] * maskVal;

}

out[outIdx] = (px8_t)CLIP(sum);

}

gblur(Imaging in, Imagin out, int radius) {

UINT8* d_img = // allocate in->xsize * in->ysize on device

float* d_buff = // allocate a tmp buffer on device of size in->ysize * in->xsize

float* d_mask = // allocate and create a guassian blur mask on device

// copy the image into the device

size_t rowSize = in->xsize * sizeof(UINT8);

for (int i = 0; i < in->ysize; ++i) {

cudaMemcpyAsync(d_img + i * rowSize, in->image8[i], rowSize, cudaMemcpyHostToDevice, 0);

}

size_t xBlocks = (in->xsize + ntpb - 1) / ntpb;

size_t yBlocks = (in->ysize + ntpb - 1) / ntpb;

dim3 grid(xBlocks, yBlocks);

dim3 block(ntpb, ntpb);

blurRows<<<grid,block>>>(d_img, d_buff, in->xsize, in->ysize, d_mask, radius);

blurCols<<<grid,block>>>(d_buff, d_img, in->xsize, in->ysize, d_mask, radius);

for (int i = 0; i < in->ysize; ++i) {

cudaMemcpyAsync(out->image8[i], d_img + i * rowSize, rowSize, cudaMemcpyDeviceToHost, 0);

}

cudaDeviceSynchronize();

cudaFree(d_img);

cudaFree(d_buff);

cudaFree((void*) d_mask);

}

Gabriel Castro Londono

1

edit

Changes

PIL Cuda

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools