Difference between revisions of "Team Lion"

From CDOT Wiki
Jump to: navigation, search
(Image Processing Performance Using Parallel Programming)
(Source Code)
 
(5 intermediate revisions by the same user not shown)
Line 1: Line 1:
 
{{GPU621/DPS921 Index | 20161}}
 
{{GPU621/DPS921 Index | 20161}}
 
= Image Processing Performance Using Parallel Programming =
 
= Image Processing Performance Using Parallel Programming =
This assignment introduces simple image processing using MPI parallel programming.
+
This assignment introduces simple image processing using MPI parallel programming. This assignment also explains about performance comparison from 1 core to 16 cores.
 +
 
  
 
== Team Lion Member ==
 
== Team Lion Member ==
 
Byungho Kim
 
Byungho Kim
 +
  
 
== Basic Concept of Image Processing ==
 
== Basic Concept of Image Processing ==
 +
 
* Convolution between image and filter(kernel)
 
* Convolution between image and filter(kernel)
 
* Each processing of pixel independent from another pixels
 
* Each processing of pixel independent from another pixels
 
* Can be used for Edge finding, Blur and image enhancement.
 
* Can be used for Edge finding, Blur and image enhancement.
  
[[Image:GPU621-LION-1.png||640|]]
+
[[Image:GPU621-LION-1.png|640px]]
 +
 
 +
 
 +
== Convolution ==
 +
 
 +
[[Image:GPU621-LION-2.png|640px]]
 +
 
 +
[[Image:GPU621-LION-3.png|640px]]
 +
 
 +
 
 +
== Edge Handling ==
 +
 
 +
All pixels in edges need special treatment.
 +
* Extend
 +
* Tile
 +
* Crop
 +
 
 +
[[Image:GPU621-LION-4.png|640px|Edge Handling(Extend)]]
 +
 
 +
 
 +
== False Sharing Consideration ==
 +
*Threading each pixel – The worst
 +
*Threading each row – Good
 +
*Threading multiple rows band – The best
 +
 
 +
 
 +
== Test Environment Consideration ==
 +
*How to test performance more than 4 cores computer.
 +
 
 +
 
 +
== Azure VM Environment ==
 +
*Cloud service from Microsoft
 +
*You can rent many kind of Virtual Machines.
 +
*Access remotely using Windows Remote Desktop Connection.
 +
 
 +
[[Image:GPU621-LION-6-2.png|640px|Edge Handling(Extend)]]
 +
 
 +
[[Image:GPU621-LION-6.png|640px|Edge Handling(Extend)]]
 +
 
 +
 
 +
== Test Result ==
 +
[[Image:GPU621-LION-5.png|640px|Edge Handling(Extend)]]
 +
 
 +
[[Image:GPU621-LION-7.png|640px|Edge Handling(Extend)]]
 +
 
 +
[[Image:GPU621-LION-8.png|640px|Edge Handling(Extend)]]
 +
 
 +
 
 +
== Conclusion ==
 +
*Performance depends on number of cores.
 +
*OpenMP is easy to use. Much easier than normal(POSIX) Thread method.
 +
 
 +
 
 +
== Source Code ==
 +
 
 +
<syntaxhighlight lang="c" line="1" >
 +
#include "corona.h"
 +
#include <omp.h>
 +
#include <iostream>
 +
#include <cstdlib>
 +
 
 +
 
 +
corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) {
 +
 
 +
    int width = image->getWidth();
 +
    int height = image->getHeight();
 +
    unsigned char* pixels = (unsigned char*)(image->getPixels());
 +
 
 +
    corona::Image* result = corona::CloneImage(image);
 +
    unsigned char* resultPixels = (unsigned char*)(result->getPixels());
 +
 
 +
 
 +
    for (int py = 0; py < height; py++) {
 +
 
 +
        int pixelIndex = 0;
 +
        int targetPixelX = 0;
 +
        int targetPixelY = 0;
 +
 
 +
 
 +
        for (int px = 0; px < width; px++) {
 +
 
 +
            float accuR = 0;
 +
            float accuG = 0;
 +
            float accuB = 0;
 +
            float accuA = 0;
 +
 
 +
            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {
 +
                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {
 +
 
 +
                    targetPixelX = px - kernalSizeWidth / 2 + kernalX;
 +
                    if (targetPixelX < 0) targetPixelX = 0;
 +
                    else if (targetPixelX >= width) targetPixelX = width - 1;
 +
 
 +
                    targetPixelY = py - kernalSizeHeight / 2 + kernalY;
 +
                    if (targetPixelY < 0) targetPixelY = 0;
 +
                    else if (targetPixelY >= height) targetPixelY = height - 1;
 +
 
 +
                    pixelIndex = (targetPixelY * width + targetPixelX) * 4;
 +
 
 +
                    accuR += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
 +
                    accuG += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
 +
                    accuB += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
 +
                }
 +
            }
 +
 
 +
            pixelIndex = (py * width + px) * 4;
 +
 
 +
            if (accuR > 255)  accuR = 255;
 +
            else  if (accuR < 0) accuR = 0;
 +
            resultPixels[pixelIndex++] = accuR;
 +
 
 +
            if (accuG > 255)  accuG = 255;
 +
            else  if (accuG < 0) accuG = 0;
 +
            resultPixels[pixelIndex++] = accuG;
 +
 
 +
            if (accuB > 255)  accuB = 255;
 +
            else  if (accuB < 0) accuB = 0;
 +
            resultPixels[pixelIndex++] = accuB;
 +
        }
 +
    }
 +
 
 +
    return result;
 +
}
 +
 
 +
 
 +
corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) {
 +
    int width = image->getWidth();
 +
    int height = image->getHeight();
 +
    unsigned char* pixels = (unsigned char*)(image->getPixels());
 +
 
 +
    corona::Image* result = corona::CloneImage(image);
 +
    unsigned char* resultPixels = (unsigned char*)(result->getPixels());
 +
 
 +
    omp_set_dynamic(0);
 +
    omp_set_num_threads(requestedCore);
 +
 
 +
#pragma omp parallel
 +
    {
 +
        int tid = omp_get_thread_num();
 +
        int nt = omp_get_num_threads();
 +
 
 +
        int jumpRange = height / nt;
 +
 
 +
        for (int py = jumpRange * tid; py < jumpRange * (tid + 1) && py < height; py++) {
 +
 
 +
            int pixelIndex = 0;
 +
            int targetPixelX = 0;
 +
            int targetPixelY = 0;
 +
 
 +
 
 +
            for (int px = 0; px < width; px++) {
 +
 
 +
                float accuR = 0;
 +
                float accuG = 0;
 +
                float accuB = 0;
 +
                float accuA = 0;
 +
 
 +
                for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {
 +
                    for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {
 +
 
 +
                        targetPixelX = px - kernalSizeWidth / 2 + kernalX;
 +
                        if (targetPixelX < 0) targetPixelX = 0;
 +
                        else if (targetPixelX >= width) targetPixelX = width - 1;
 +
 
 +
                        targetPixelY = py - kernalSizeHeight / 2 + kernalY;
 +
                        if (targetPixelY < 0) targetPixelY = 0;
 +
                        else if (targetPixelY >= height) targetPixelY = height - 1;
 +
 
 +
                        pixelIndex = (targetPixelY * width + targetPixelX) * 4;
 +
 
 +
                        accuR += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
 +
                        accuG += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
 +
                        accuB += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
 +
                    }
 +
                }
 +
 
 +
                pixelIndex = (py * width + px) * 4;
 +
 
 +
                if (accuR > 255)  accuR = 255;
 +
                else  if (accuR < 0) accuR = 0;
 +
                resultPixels[pixelIndex++] = accuR;
 +
 
 +
                if (accuG > 255)  accuG = 255;
 +
                else  if (accuG < 0) accuG = 0;
 +
                resultPixels[pixelIndex++] = accuG;
 +
 
 +
                if (accuB > 255)  accuB = 255;
 +
                else  if (accuB < 0) accuB = 0;
 +
                resultPixels[pixelIndex++] = accuB;
 +
            }
 +
        }
 +
    }
 +
 
 +
    return result;
 +
}
 +
 
 +
 
 +
int main() {
 +
 
 +
    corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8);
 +
    if (!image) {
 +
        return 1;
 +
        // error!
 +
    }
 +
 
 +
    int width = image->getWidth();
 +
    int height = image->getHeight();
 +
    void* pixels = image->getPixels();
 +
 
 +
 
 +
    float Laplacian9x9Kernal[] = {
 +
        0, -1, -1, -2, -2, -2, -1, -1, 0,
 +
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
 +
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
 +
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
 +
        -2, -5, -0, 24, 40, 24, -0, -5, -2,
 +
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
 +
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
 +
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
 +
        0, -1, -1, -2, -2, -2, -1, -1,  0 };
 +
 
 +
 
 +
    double start;
 +
    double end;
 +
 
 +
 
 +
    start = omp_get_wtime();
 +
    corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9);
 +
    end = omp_get_wtime();
 +
    std::cout << "Serial processing time = " << end - start << std::endl;
 +
 
 +
    //corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage);
 +
 
 +
    for (int coreNumber = 1; coreNumber <= 16; coreNumber++) {
 +
        start = omp_get_wtime();
 +
        resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber);
 +
        end = omp_get_wtime();
 +
        std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl;
 +
    }
 +
    //corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage);
 +
}
 +
 
 +
</syntaxhighlight>

Latest revision as of 10:09, 5 April 2016


GPU621/DPS921 | Participants | Groups and Projects | Resources | Glossary

Image Processing Performance Using Parallel Programming

This assignment introduces simple image processing using OpenMP parallel programming. It also compares performance when running on 1 to 16 cores.


Team Lion Member

Byungho Kim


Basic Concept of Image Processing

  • Convolution between image and filter(kernel)
  • Each pixel is processed independently of the other pixels
  • Can be used for Edge finding, Blur and image enhancement.

GPU621-LION-1.png


Convolution

GPU621-LION-2.png

GPU621-LION-3.png


Edge Handling

All pixels in edges need special treatment.

  • Extend
  • Tile
  • Crop

Edge Handling(Extend)


False Sharing Consideration

  • Threading each pixel – The worst
  • Threading each row – Good
  • Threading multiple rows band – The best


Test Environment Consideration

  • How to test performance on a computer with more than 4 cores.


Azure VM Environment

  • Cloud service from Microsoft
  • You can rent many kinds of Virtual Machines.
  • Access remotely using Windows Remote Desktop Connection.

Edge Handling(Extend)

Edge Handling(Extend)


Test Result

Edge Handling(Extend)

Edge Handling(Extend)

Edge Handling(Extend)


Conclusion

  • Performance depends on number of cores.
  • OpenMP is easy to use, and much easier than the normal (POSIX) thread method.


Source Code

#include "corona.h"
#include <omp.h>
#include <iostream>
#include <cstdlib>


/**
 * Convolves an image with a kernel on a single thread.
 *
 * The pixel buffer is indexed as 4 bytes per pixel; only the first three
 * bytes of each pixel are filtered. The fourth byte of the result is never
 * written here, so it keeps whatever corona::CloneImage put there
 * (presumably a copy of the source value - confirm against Corona).
 * Samples that fall outside the image are taken from the nearest edge pixel
 * ("extend" edge handling).
 *
 * @param image            Source image; not modified.
 * @param kernal           Row-major kernel weights,
 *                         kernalSizeWidth * kernalSizeHeight entries.
 * @param kernalSizeWidth  Kernel width in taps.
 * @param kernalSizeHeight Kernel height in taps.
 * @return A newly created image holding the filtered pixels, or NULL when
 *         image is null or the clone could not be created.
 */
corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) {

    // Fail softly instead of dereferencing a null image.
    if (!image) return NULL;

    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<const unsigned char*>(image->getPixels());

    corona::Image* result = corona::CloneImage(image);
    if (!result) return NULL;
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    // Loop-invariant offsets that centre the kernel on the current pixel.
    const int halfKernalWidth = kernalSizeWidth / 2;
    const int halfKernalHeight = kernalSizeHeight / 2;

    for (int py = 0; py < height; py++) {
        for (int px = 0; px < width; px++) {

            float accuR = 0;
            float accuG = 0;
            float accuB = 0;

            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {

                // The clamped source row does not depend on kernalX.
                int targetPixelY = py - halfKernalHeight + kernalY;
                if (targetPixelY < 0) targetPixelY = 0;
                else if (targetPixelY >= height) targetPixelY = height - 1;

                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {

                    int targetPixelX = px - halfKernalWidth + kernalX;
                    if (targetPixelX < 0) targetPixelX = 0;
                    else if (targetPixelX >= width) targetPixelX = width - 1;

                    // size_t arithmetic keeps the byte offset from overflowing int on large images.
                    const std::size_t sourceIndex = (static_cast<std::size_t>(targetPixelY) * width + targetPixelX) * 4;
                    const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                    accuR += pixels[sourceIndex] * weight;
                    accuG += pixels[sourceIndex + 1] * weight;
                    accuB += pixels[sourceIndex + 2] * weight;
                }
            }

            // Saturate each channel to the 0..255 byte range before storing.
            if (accuR > 255) accuR = 255;
            else if (accuR < 0) accuR = 0;

            if (accuG > 255) accuG = 255;
            else if (accuG < 0) accuG = 0;

            if (accuB > 255) accuB = 255;
            else if (accuB < 0) accuB = 0;

            const std::size_t destIndex = (static_cast<std::size_t>(py) * width + px) * 4;
            resultPixels[destIndex] = static_cast<unsigned char>(accuR);
            resultPixels[destIndex + 1] = static_cast<unsigned char>(accuG);
            resultPixels[destIndex + 2] = static_cast<unsigned char>(accuB);
        }
    }

    return result;
}


/**
 * Convolves an image with a kernel using OpenMP, giving each thread one
 * contiguous band of rows.
 *
 * The pixel buffer is indexed as 4 bytes per pixel; only the first three
 * bytes of each pixel are filtered. The fourth byte of the result is never
 * written here, so it keeps whatever corona::CloneImage put there
 * (presumably a copy of the source value - confirm against Corona).
 * Samples that fall outside the image are taken from the nearest edge pixel
 * ("extend" edge handling).
 *
 * Side effect: disables OpenMP dynamic thread adjustment and sets the
 * default thread count through omp_set_dynamic / omp_set_num_threads.
 *
 * @param image            Source image; only read.
 * @param kernal           Row-major kernel weights,
 *                         kernalSizeWidth * kernalSizeHeight entries.
 * @param kernalSizeWidth  Kernel width in taps.
 * @param kernalSizeHeight Kernel height in taps.
 * @param requestedCore    Number of threads to request; values below 1 are
 *                         treated as 1.
 * @return A newly created image holding the filtered pixels, or NULL when
 *         image is null or the clone could not be created.
 */
corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) {

    // Fail softly instead of dereferencing a null image.
    if (!image) return NULL;

    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<const unsigned char*>(image->getPixels());

    corona::Image* result = corona::CloneImage(image);
    if (!result) return NULL;
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    // Loop-invariant offsets that centre the kernel on the current pixel.
    const int halfKernalWidth = kernalSizeWidth / 2;
    const int halfKernalHeight = kernalSizeHeight / 2;

    omp_set_dynamic(0);
    // omp_set_num_threads requires a positive argument.
    omp_set_num_threads(requestedCore > 0 ? requestedCore : 1);

#pragma omp parallel
    {
        const int tid = omp_get_thread_num();
        const int nt = omp_get_num_threads();

        // Split the rows proportionally so that every row belongs to exactly
        // one band. A fixed band size of height / nt would leave the last
        // height % nt rows unfiltered (and every row when height < nt).
        const int firstRow = static_cast<int>(static_cast<long long>(height) * tid / nt);
        const int endRow = static_cast<int>(static_cast<long long>(height) * (tid + 1) / nt);

        for (int py = firstRow; py < endRow; py++) {
            for (int px = 0; px < width; px++) {

                float accuR = 0;
                float accuG = 0;
                float accuB = 0;

                for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {

                    // The clamped source row does not depend on kernalX.
                    int targetPixelY = py - halfKernalHeight + kernalY;
                    if (targetPixelY < 0) targetPixelY = 0;
                    else if (targetPixelY >= height) targetPixelY = height - 1;

                    for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {

                        int targetPixelX = px - halfKernalWidth + kernalX;
                        if (targetPixelX < 0) targetPixelX = 0;
                        else if (targetPixelX >= width) targetPixelX = width - 1;

                        // size_t arithmetic keeps the byte offset from overflowing int on large images.
                        const std::size_t sourceIndex = (static_cast<std::size_t>(targetPixelY) * width + targetPixelX) * 4;
                        const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                        accuR += pixels[sourceIndex] * weight;
                        accuG += pixels[sourceIndex + 1] * weight;
                        accuB += pixels[sourceIndex + 2] * weight;
                    }
                }

                // Saturate each channel to the 0..255 byte range before storing.
                if (accuR > 255) accuR = 255;
                else if (accuR < 0) accuR = 0;

                if (accuG > 255) accuG = 255;
                else if (accuG < 0) accuG = 0;

                if (accuB > 255) accuB = 255;
                else if (accuB < 0) accuB = 0;

                // Each thread writes only rows of its own band, so no two threads write the same byte.
                const std::size_t destIndex = (static_cast<std::size_t>(py) * width + px) * 4;
                resultPixels[destIndex] = static_cast<unsigned char>(accuR);
                resultPixels[destIndex + 1] = static_cast<unsigned char>(accuG);
                resultPixels[destIndex + 2] = static_cast<unsigned char>(accuB);
            }
        }
    }

    return result;
}


int main() {

    corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8);
    if (!image) {
        return 1;
        // error!
    }

    int width = image->getWidth();
    int height = image->getHeight();
    void* pixels = image->getPixels();


    float Laplacian9x9Kernal[] = {
        0, -1, -1, -2, -2, -2, -1, -1, 0,
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
        -2, -5, -0, 24, 40, 24, -0, -5, -2,
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
        0, -1, -1, -2, -2, -2, -1, -1,  0 };


    double start;
    double end;


    start = omp_get_wtime();
    corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9);
    end = omp_get_wtime();
    std::cout << "Serial processing time = " << end - start << std::endl;

    //corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage);

    for (int coreNumber = 1; coreNumber <= 16; coreNumber++) {
        start = omp_get_wtime();
        resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber);
        end = omp_get_wtime();
        std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl;
    }
    //corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage);
}