Changes

Jump to: navigation, search

PIL Cuda

3,069 bytes added, 14:24, 10 December 2014
Pillow and Guassian Blur in CUDA
This is rather simplified, as the `Imaging` struct contains many more properties and we're taking advantage of the fact that the library separates an incoming image into separate `Imaging` structs for each colour channel.
 
=== Gaussian Blur in CUDA===
While all the code is available on github https://github.com/GabrielCastro/Pillow this is a simplified psudo code version of the blur process
 
__global__ static void blurRows(const px8_t* __restrict__ in, float* __restrict__ buff,
const size_t xSize, const size_t ySize,
const float* __restrict__ mask, const size_t radius) {
const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= ySize || x >= xSize) {
return;
}
const size_t buffIdx = y * xSize + x;
float sum = 0;
for (size_t p = 0; p < radius; ++p) {
float maskVal = mask[p];
int offset = (int)(-((float)radius / 2.0) + (float)p + 0.5);
int xOff = x + offset;
if (xOff < 0) {
offset = -x;
} else if (xOff >= xSize) {
offset = xSize - x - 1;
}
size_t pxIndex = buffIdx + offset;
sum += in[pxIndex] * maskVal;
}
buff[buffIdx] = sum;
}
 
 
__global__ static void blurCols(const float* __restrict__ buff, px8_t* __restrict__ out,
const size_t xSize, const size_t ySize,
const float* __restrict__ mask, const size_t radius) {
const size_t y = blockIdx.y * blockDim.y + threadIdx.y;
const size_t x = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= ySize || x >= xSize) {
return;
}
size_t outIdx = y * xSize + x;
float sum = 0;
for (size_t p = 0; p < radius; ++p) {
float maskVal = mask[p];
int offset = (int)(-((float)radius / 2.0) + (float)p + 0.5);
int lOff = y + offset;
if (lOff < 0) {
offset = -y;
} else if (lOff >= ySize) {
offset = ySize - y - 1;
}
size_t buffIdx = outIdx + offset * xSize;
sum += buff[buffIdx] * maskVal;
}
out[outIdx] = (px8_t)CLIP(sum);
}
 
 
gblur(Imaging in, Imagin out, int radius) {
UINT8* d_img = // allocate in->xsize * in->ysize on device
float* d_buff = // allocate a tmp buffer on device of size in->ysize * in->xsize
float* d_mask = // allocate and create a guassian blur mask on device
// copy the image into the device
size_t rowSize = in->xsize * sizeof(UINT8);
for (int i = 0; i < in->ysize; ++i) {
cudaMemcpyAsync(d_img + i * rowSize, in->image8[i], rowSize, cudaMemcpyHostToDevice, 0);
}
size_t xBlocks = (in->xsize + ntpb - 1) / ntpb;
size_t yBlocks = (in->ysize + ntpb - 1) / ntpb;
dim3 grid(xBlocks, yBlocks);
dim3 block(ntpb, ntpb);
blurRows<<<grid,block>>>(d_img, d_buff, in->xsize, in->ysize, d_mask, radius);
blurCols<<<grid,block>>>(d_buff, d_img, in->xsize, in->ysize, d_mask, radius);
for (int i = 0; i < in->ysize; ++i) {
cudaMemcpyAsync(out->image8[i], d_img + i * rowSize, rowSize, cudaMemcpyDeviceToHost, 0);
}
cudaDeviceSynchronize();
cudaFree(d_img);
cudaFree(d_buff);
cudaFree((void*) d_mask);
}

Navigation menu