GPU621/DPS921 | Participants | Groups and Projects | Resources | Glossary

Image Processing Performance Using Parallel Programming

This assignment introduces simple image processing using OpenMP parallel programming. It also compares the performance obtained when running on 1 to 16 cores.

Team Lion Member

Byungho Kim

Basic Concept of Image Processing

Convolution between an image and a filter (kernel).
Each pixel is processed independently of all other pixels.
Can be used for edge finding, blurring, and image enhancement.

Convolution

Edge Handling

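Pixels at the edges of the image need special treatment, because the kernel reaches outside the image there. Common options are listed below; the source code on this page uses Extend (out-of-range coordinates are clamped to the nearest edge pixel).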

Extend
Tile
Crop

False Sharing Consideration

Threading each pixel – The worst
Threading each row – Good
Threading multiple rows band – The best

Test Environment Consideration

How to test performance on a computer with more than 4 cores.

Azure VM Environment

Cloud service from Microsoft
You can rent many kinds of virtual machines.
Access remotely using Windows Remote Desktop Connection.

Test Result

Conclusion

Performance depends on the number of cores.
OpenMP is easy to use, much easier than the conventional (POSIX) threads approach.

Source Code

#include "corona.h"
#include <omp.h>
#include <iostream>
#include <cstdlib>


/**
 * Single-threaded 2D convolution of an image with a kernel.
 *
 * The kernel is centred on each pixel in turn and a weighted sum of the
 * surrounding pixels is accumulated separately for the first three bytes
 * (R, G, B) of every 4-byte pixel. Sample coordinates that fall outside the
 * image are clamped to the nearest valid row/column. Each sum is limited to
 * the range [0, 255] before it is stored.
 *
 * The fourth byte of each pixel is never written here, so it keeps whatever
 * value it has in the image returned by corona::CloneImage.
 *
 * @param image             source image; its pixel buffer is indexed as 4 bytes per pixel
 * @param kernal            kernel weights in row-major order
 *                          (kernalSizeWidth * kernalSizeHeight entries are read)
 * @param kernalSizeWidth   number of kernel columns
 * @param kernalSizeHeight  number of kernel rows
 * @return the image obtained from corona::CloneImage(image) with the filtered
 *         values written into it
 */
corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) {

    const int imgW = image->getWidth();
    const int imgH = image->getHeight();
    const unsigned char* src = static_cast<unsigned char*>(image->getPixels());

    corona::Image* result = corona::CloneImage(image);
    unsigned char* dst = static_cast<unsigned char*>(result->getPixels());

    // Offsets that centre the kernel on the pixel being computed.
    const int halfW = kernalSizeWidth / 2;
    const int halfH = kernalSizeHeight / 2;

    for (int row = 0; row < imgH; ++row) {
        for (int col = 0; col < imgW; ++col) {

            float sumR = 0;
            float sumG = 0;
            float sumB = 0;

            for (int ky = 0; ky < kernalSizeHeight; ++ky) {

                // Row to sample, clamped into the image.
                int srcRow = row - halfH + ky;
                if (srcRow < 0) srcRow = 0;
                else if (srcRow >= imgH) srcRow = imgH - 1;

                for (int kx = 0; kx < kernalSizeWidth; ++kx) {

                    // Column to sample, clamped into the image.
                    int srcCol = col - halfW + kx;
                    if (srcCol < 0) srcCol = 0;
                    else if (srcCol >= imgW) srcCol = imgW - 1;

                    const float weight = kernal[ky * kernalSizeWidth + kx];
                    const unsigned char* sample = src + (srcRow * imgW + srcCol) * 4;

                    sumR += sample[0] * weight;
                    sumG += sample[1] * weight;
                    sumB += sample[2] * weight;
                }
            }

            // Limit each channel to the byte range, then store it.
            unsigned char* out = dst + (row * imgW + col) * 4;
            out[0] = static_cast<unsigned char>(sumR > 255 ? 255 : (sumR < 0 ? 0 : sumR));
            out[1] = static_cast<unsigned char>(sumG > 255 ? 255 : (sumG < 0 ? 0 : sumG));
            out[2] = static_cast<unsigned char>(sumB > 255 ? 255 : (sumB < 0 ? 0 : sumB));
        }
    }

    return result;
}


/**
 * OpenMP-parallel 2D convolution of an image with a kernel.
 *
 * Performs the same per-pixel computation as calcConvolutitonSerial: the
 * kernel is centred on each pixel, samples outside the image are clamped to
 * the nearest valid row/column, and the weighted sums for the first three
 * bytes (R, G, B) of every 4-byte pixel are limited to [0, 255] and stored.
 * The fourth byte of each pixel is not written and keeps the value it has in
 * the image returned by corona::CloneImage.
 *
 * Rows are distributed over the threads in contiguous bands. Every row is
 * assigned to exactly one thread, including the remainder rows when the image
 * height is not a multiple of the thread count and the case where there are
 * more threads than rows.
 *
 * Side effect: changes the process-wide OpenMP settings via
 * omp_set_dynamic(0) and omp_set_num_threads(requestedCore).
 *
 * @param image             source image; its pixel buffer is indexed as 4 bytes per pixel
 * @param kernal            kernel weights in row-major order
 *                          (kernalSizeWidth * kernalSizeHeight entries are read)
 * @param kernalSizeWidth   number of kernel columns
 * @param kernalSizeHeight  number of kernel rows
 * @param requestedCore     number of OpenMP threads to request; values below 1
 *                          are treated as 1
 * @return the image obtained from corona::CloneImage(image) with the filtered
 *         values written into it, or a null pointer if the clone is null
 */
corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) {
    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<unsigned char*>(image->getPixels());

    corona::Image* result = corona::CloneImage(image);
    if (!result) return result;  // nothing to write into; hand the null back to the caller
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    // omp_set_num_threads needs a positive thread count.
    if (requestedCore < 1) requestedCore = 1;

    // Disable dynamic adjustment so the requested thread count is the one used.
    omp_set_dynamic(0);
    omp_set_num_threads(requestedCore);

    const int halfKernalWidth = kernalSizeWidth / 2;
    const int halfKernalHeight = kernalSizeHeight / 2;

    // schedule(static) without a chunk size splits the rows into roughly equal
    // contiguous bands, at most one band per thread. Unlike a hand-computed
    // "height / threads" band size, no trailing rows are left unprocessed.
#pragma omp parallel for schedule(static)
    for (int py = 0; py < height; py++) {
        for (int px = 0; px < width; px++) {

            float accuR = 0;
            float accuG = 0;
            float accuB = 0;

            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {

                // Sampled row, clamped into the image.
                int targetPixelY = py - halfKernalHeight + kernalY;
                if (targetPixelY < 0) targetPixelY = 0;
                else if (targetPixelY >= height) targetPixelY = height - 1;

                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {

                    // Sampled column, clamped into the image.
                    int targetPixelX = px - halfKernalWidth + kernalX;
                    if (targetPixelX < 0) targetPixelX = 0;
                    else if (targetPixelX >= width) targetPixelX = width - 1;

                    const int sourceIndex = (targetPixelY * width + targetPixelX) * 4;
                    const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                    accuR += pixels[sourceIndex] * weight;
                    accuG += pixels[sourceIndex + 1] * weight;
                    accuB += pixels[sourceIndex + 2] * weight;
                }
            }

            // Limit each channel to the byte range before storing it.
            if (accuR > 255) accuR = 255;
            else if (accuR < 0) accuR = 0;

            if (accuG > 255) accuG = 255;
            else if (accuG < 0) accuG = 0;

            if (accuB > 255) accuB = 255;
            else if (accuB < 0) accuB = 0;

            const int resultIndex = (py * width + px) * 4;
            resultPixels[resultIndex] = static_cast<unsigned char>(accuR);
            resultPixels[resultIndex + 1] = static_cast<unsigned char>(accuG);
            resultPixels[resultIndex + 2] = static_cast<unsigned char>(accuB);
        }
    }

    return result;
}


int main() {

    corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8);
    if (!image) {
        return 1;
        // error!
    }

    int width = image->getWidth();
    int height = image->getHeight();
    void* pixels = image->getPixels();


    float Laplacian9x9Kernal[] = {
        0, -1, -1, -2, -2, -2, -1, -1, 0,
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
        -2, -5, -0, 24, 40, 24, -0, -5, -2,
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
        0, -1, -1, -2, -2, -2, -1, -1,  0 };


    double start;
    double end;


    start = omp_get_wtime();
    corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9);
    end = omp_get_wtime();
    std::cout << "Serial processing time = " << end - start << std::endl;

    //corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage);

    for (int coreNumber = 1; coreNumber <= 16; coreNumber++) {
        start = omp_get_wtime();
        resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber);
        end = omp_get_wtime();
        std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl;
    }
    //corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage);
}

Team Lion

Contents

Image Processing Performance Using Parallel Programming

Team Lion Member

Basic Concept of Image Processing

Convolution

Edge Handling

False Sharing Consideration

Test Environment Consideration

Azure VM Environment

Test Result

Conclusion

Source Code

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools