Latest revision as of 10:09, 5 April 2016

GPU621/DPS921 | Participants | Groups and Projects | Resources | Glossary

Image Processing Performance Using Parallel Programming

This assignment introduces simple image processing using OpenMP parallel programming. It also compares performance when running on 1 to 16 cores.

Team Lion Member

Byungho Kim

Basic Concept of Image Processing

Convolution between image and filter(kernel)
Each pixel is processed independently of the other pixels
Can be used for Edge finding, Blur and image enhancement.

Convolution

Edge Handling

All pixels in edges need special treatment.

Extend
Tile
Crop

False Sharing Consideration

Threading each pixel – The worst
Threading each row – Good
Threading multiple rows band – The best

Test Environment Consideration

How to test performance on a computer with more than 4 cores.

Azure VM Environment

Cloud service from Microsoft
You can rent many kinds of virtual machines.
Access remotely using Windows Remote Desktop Connection.

Test Result

Conclusion

Performance depends on number of cores.
OpenMP is easy to use. Much easier than normal(POSIX) Thread method.

Source Code

#include "corona.h"
#include <omp.h>
#include <iostream>
#include <cstdlib>


/**
 * Applies a 2-D convolution kernel to an image on the calling thread.
 *
 * The source image is read with a stride of 4 bytes per pixel and only the
 * first three channels of each pixel are filtered; the fourth byte of every
 * output pixel keeps whatever value corona::CloneImage copied into it.
 * Source coordinates that fall outside the image are clamped to the nearest
 * edge pixel ("extend" edge handling).
 *
 * @param image             Source image. NOTE(review): the indexing assumes a
 *                          4-byte-per-pixel layout such as PF_R8G8B8A8 - the
 *                          caller must supply that format.
 * @param kernal            Row-major kernel weights, kernalSizeWidth *
 *                          kernalSizeHeight entries.
 * @param kernalSizeWidth   Number of kernel columns.
 * @param kernalSizeHeight  Number of kernel rows.
 * @return A newly cloned image holding the filtered pixels, or nullptr when
 *         image/kernal is null or the clone could not be created. The caller
 *         owns the returned image.
 */
corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) {

    // Nothing sensible can be computed without both inputs.
    if (!image || !kernal) {
        return nullptr;
    }

    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<const unsigned char*>(image->getPixels());

    // The clone supplies the output buffer (and the untouched fourth channel).
    corona::Image* result = corona::CloneImage(image);
    if (!result) {
        return nullptr;
    }
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    const int halfKernelWidth = kernalSizeWidth / 2;
    const int halfKernelHeight = kernalSizeHeight / 2;

    for (int py = 0; py < height; py++) {
        for (int px = 0; px < width; px++) {

            float accuR = 0;
            float accuG = 0;
            float accuB = 0;

            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {

                // The clamped source row depends only on py and kernalY.
                int targetPixelY = py - halfKernelHeight + kernalY;
                if (targetPixelY < 0) targetPixelY = 0;
                else if (targetPixelY >= height) targetPixelY = height - 1;

                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {

                    int targetPixelX = px - halfKernelWidth + kernalX;
                    if (targetPixelX < 0) targetPixelX = 0;
                    else if (targetPixelX >= width) targetPixelX = width - 1;

                    const int sourceIndex = (targetPixelY * width + targetPixelX) * 4;
                    const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                    accuR += pixels[sourceIndex] * weight;
                    accuG += pixels[sourceIndex + 1] * weight;
                    accuB += pixels[sourceIndex + 2] * weight;
                }
            }

            // Clamp each channel to the 8-bit range before the narrowing store.
            if (accuR > 255) accuR = 255;
            else if (accuR < 0) accuR = 0;

            if (accuG > 255) accuG = 255;
            else if (accuG < 0) accuG = 0;

            if (accuB > 255) accuB = 255;
            else if (accuB < 0) accuB = 0;

            const int destIndex = (py * width + px) * 4;
            resultPixels[destIndex] = static_cast<unsigned char>(accuR);
            resultPixels[destIndex + 1] = static_cast<unsigned char>(accuG);
            resultPixels[destIndex + 2] = static_cast<unsigned char>(accuB);
        }
    }

    return result;
}


/**
 * Applies a 2-D convolution kernel to an image using an OpenMP thread team.
 *
 * Rows are distributed over the team with a static schedule, so each thread
 * works on one contiguous band of rows and every row in [0, height) is
 * processed exactly once - including when height is not a multiple of the
 * thread count or is smaller than it.
 *
 * The source image is read with a stride of 4 bytes per pixel and only the
 * first three channels of each pixel are filtered; the fourth byte of every
 * output pixel keeps whatever value corona::CloneImage copied into it.
 * Source coordinates that fall outside the image are clamped to the nearest
 * edge pixel ("extend" edge handling).
 *
 * Side effect: calls omp_set_dynamic(0) and omp_set_num_threads(), which
 * change the OpenMP settings used by later parallel regions of the caller.
 *
 * @param image             Source image. NOTE(review): the indexing assumes a
 *                          4-byte-per-pixel layout such as PF_R8G8B8A8 - the
 *                          caller must supply that format.
 * @param kernal            Row-major kernel weights, kernalSizeWidth *
 *                          kernalSizeHeight entries.
 * @param kernalSizeWidth   Number of kernel columns.
 * @param kernalSizeHeight  Number of kernel rows.
 * @param requestedCore     Number of threads to request; values below 1 are
 *                          treated as 1.
 * @return A newly cloned image holding the filtered pixels, or nullptr when
 *         image/kernal is null or the clone could not be created. The caller
 *         owns the returned image.
 */
corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) {
    // Nothing sensible can be computed without both inputs.
    if (!image || !kernal) {
        return nullptr;
    }

    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<const unsigned char*>(image->getPixels());

    // The clone supplies the output buffer (and the untouched fourth channel).
    corona::Image* result = corona::CloneImage(image);
    if (!result) {
        return nullptr;
    }
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    // omp_set_num_threads requires a positive value.
    if (requestedCore < 1) {
        requestedCore = 1;
    }

    // Disable dynamic team sizing so the requested thread count is honoured.
    omp_set_dynamic(0);
    omp_set_num_threads(requestedCore);

    const int halfKernelWidth = kernalSizeWidth / 2;
    const int halfKernelHeight = kernalSizeHeight / 2;

    // schedule(static) without a chunk size hands each thread one contiguous
    // band of rows, which keeps writes from different threads far apart and
    // covers any remainder rows that a manual height / nt split would drop.
#pragma omp parallel for schedule(static)
    for (int py = 0; py < height; py++) {
        for (int px = 0; px < width; px++) {

            float accuR = 0;
            float accuG = 0;
            float accuB = 0;

            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {

                // The clamped source row depends only on py and kernalY.
                int targetPixelY = py - halfKernelHeight + kernalY;
                if (targetPixelY < 0) targetPixelY = 0;
                else if (targetPixelY >= height) targetPixelY = height - 1;

                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {

                    int targetPixelX = px - halfKernelWidth + kernalX;
                    if (targetPixelX < 0) targetPixelX = 0;
                    else if (targetPixelX >= width) targetPixelX = width - 1;

                    const int sourceIndex = (targetPixelY * width + targetPixelX) * 4;
                    const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                    accuR += pixels[sourceIndex] * weight;
                    accuG += pixels[sourceIndex + 1] * weight;
                    accuB += pixels[sourceIndex + 2] * weight;
                }
            }

            // Clamp each channel to the 8-bit range before the narrowing store.
            if (accuR > 255) accuR = 255;
            else if (accuR < 0) accuR = 0;

            if (accuG > 255) accuG = 255;
            else if (accuG < 0) accuG = 0;

            if (accuB > 255) accuB = 255;
            else if (accuB < 0) accuB = 0;

            const int destIndex = (py * width + px) * 4;
            resultPixels[destIndex] = static_cast<unsigned char>(accuR);
            resultPixels[destIndex + 1] = static_cast<unsigned char>(accuG);
            resultPixels[destIndex + 2] = static_cast<unsigned char>(accuB);
        }
    }

    return result;
}


int main() {

    corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8);
    if (!image) {
        return 1;
        // error!
    }

    int width = image->getWidth();
    int height = image->getHeight();
    void* pixels = image->getPixels();


    float Laplacian9x9Kernal[] = {
        0, -1, -1, -2, -2, -2, -1, -1, 0,
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
        -2, -5, -0, 24, 40, 24, -0, -5, -2,
        -2, -5, -3, 12, 24, 12, -3, -5, -2,
        -1, -4, -5, -3, -0, -3, -5, -4, -1,
        -1, -2, -4, -5, -5, -5, -4, -2, -1,
        0, -1, -1, -2, -2, -2, -1, -1,  0 };


    double start;
    double end;


    start = omp_get_wtime();
    corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9);
    end = omp_get_wtime();
    std::cout << "Serial processing time = " << end - start << std::endl;

    //corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage);

    for (int coreNumber = 1; coreNumber <= 16; coreNumber++) {
        start = omp_get_wtime();
        resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber);
        end = omp_get_wtime();
        std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl;
    }
    //corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage);
}

Difference between revisions of "Team Lion"

Latest revision as of 10:09, 5 April 2016

Contents

Image Processing Performance Using Parallel Programming

Team Lion Member

Basic Concept of Image Processing

Convolution

Edge Handling

False Sharing Consideration

Test Environment Consideration

Azure VM Environment

Test Result

Conclusion

Source Code

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools

@@ Line 1: / Line 1: @@
 {{GPU621/DPS921 Index | 20161}}
 = Image Processing Performance Using Parallel Programming =
-This assignment introduces simple image processing using MPI parallel programming.
+This assignment introduces simple image processing using MPI parallel programming. This assignment also explains about performance comparison from 1 core to 16 cores.
 == Team Lion Member ==
 Byungho Kim
 == Basic Concept of Image Processing ==
 * Convolution between image and filter(kernel)
 * Each processing of pixel independent from another pixels
 * Can be used for Edge finding, Blur and image enhancement.
-[[Image:GPU621-LION-1.png|thumb|| ]]
+[[Image:GPU621-LION-1.png|640px]]
+== Convolution ==
+[[Image:GPU621-LION-2.png|640px]]
+[[Image:GPU621-LION-3.png|640px]]
+== Edge Handling ==
+All pixels in edges need special treatment.
+* Extend
+* Tile
+* Crop
+[[Image:GPU621-LION-4.png|640px|Edge Handling(Extend)]]
+== False Sharing Consideration ==
+*Threading each pixel – The worst
+*Threading each row – Good
+*Threading multiple rows band – The best
+== Test Environment Consideration ==
+*How to test performance more than 4 cores computer.
+== Azure VM Environment ==
+*Cloud service from Microsoft
+*You can rent many kind of Virtual Machines.
+*Access remotely using Windows Remote Desktop Connection.
+[[Image:GPU621-LION-6-2.png|640px|Edge Handling(Extend)]]
+[[Image:GPU621-LION-6.png|640px|Edge Handling(Extend)]]
+== Test Result ==
+[[Image:GPU621-LION-5.png|640px|Edge Handling(Extend)]]
+[[Image:GPU621-LION-7.png|640px|Edge Handling(Extend)]]
+[[Image:GPU621-LION-8.png|640px|Edge Handling(Extend)]]
+== Conclusion ==
+*Performance depends on number of cores.
+*OpenMP is easy to use. Much easier than normal(POSIX) Thread method.
+== Source Code ==
+<syntaxhighlight lang="c" line="1" >
+#include "corona.h"
+#include <omp.h>
+#include <iostream>
+#include <cstdlib>
+corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) {
+    int width = image->getWidth();
+    int height = image->getHeight();
+    unsigned char* pixels = (unsigned char*)(image->getPixels());
+    corona::Image* result = corona::CloneImage(image);
+    unsigned char* resultPixels = (unsigned char*)(result->getPixels());
+    for (int py = 0; py < height; py++) {
+        int pixelIndex = 0;
+        int targetPixelX = 0;
+        int targetPixelY = 0;
+        for (int px = 0; px < width; px++) {
+            float accuR = 0;
+            float accuG = 0;
+            float accuB = 0;
+            float accuA = 0;
+            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {
+                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {
+                    targetPixelX = px - kernalSizeWidth / 2 + kernalX;
+                    if (targetPixelX < 0) targetPixelX = 0;
+                    else if (targetPixelX >= width) targetPixelX = width - 1;
+                    targetPixelY = py - kernalSizeHeight / 2 + kernalY;
+                    if (targetPixelY < 0) targetPixelY = 0;
+                    else if (targetPixelY >= height) targetPixelY = height - 1;
+                    pixelIndex = (targetPixelY * width + targetPixelX) * 4;
+                    accuR += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
+                    accuG += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
+                    accuB += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
+                }
+            }
+            pixelIndex = (py * width + px) * 4;
+            if (accuR > 255)  accuR = 255;
+            else  if (accuR < 0) accuR = 0;
+            resultPixels[pixelIndex++] = accuR;
+            if (accuG > 255)  accuG = 255;
+            else  if (accuG < 0) accuG = 0;
+            resultPixels[pixelIndex++] = accuG;
+            if (accuB > 255)  accuB = 255;
+            else  if (accuB < 0) accuB = 0;
+            resultPixels[pixelIndex++] = accuB;
+        }
+    }
+    return result;
+}
+corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) {
+    int width = image->getWidth();
+    int height = image->getHeight();
+    unsigned char* pixels = (unsigned char*)(image->getPixels());
+    corona::Image* result = corona::CloneImage(image);
+    unsigned char* resultPixels = (unsigned char*)(result->getPixels());
+    omp_set_dynamic(0);
+    omp_set_num_threads(requestedCore);
+#pragma omp parallel
+    {
+        int tid = omp_get_thread_num();
+        int nt = omp_get_num_threads();
+        int jumpRange = height / nt;
+        for (int py = jumpRange * tid; py < jumpRange * (tid + 1) && py < height; py++) {
+            int pixelIndex = 0;
+            int targetPixelX = 0;
+            int targetPixelY = 0;
+            for (int px = 0; px < width; px++) {
+                float accuR = 0;
+                float accuG = 0;
+                float accuB = 0;
+                float accuA = 0;
+                for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {
+                    for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {
+                        targetPixelX = px - kernalSizeWidth / 2 + kernalX;
+                        if (targetPixelX < 0) targetPixelX = 0;
+                        else if (targetPixelX >= width) targetPixelX = width - 1;
+                        targetPixelY = py - kernalSizeHeight / 2 + kernalY;
+                        if (targetPixelY < 0) targetPixelY = 0;
+                        else if (targetPixelY >= height) targetPixelY = height - 1;
+                        pixelIndex = (targetPixelY * width + targetPixelX) * 4;
+                        accuR += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
+                        accuG += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
+                        accuB += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX];
+                    }
+                }
+                pixelIndex = (py * width + px) * 4;
+                if (accuR > 255)  accuR = 255;
+                else  if (accuR < 0) accuR = 0;
+                resultPixels[pixelIndex++] = accuR;
+                if (accuG > 255)  accuG = 255;
+                else  if (accuG < 0) accuG = 0;
+                resultPixels[pixelIndex++] = accuG;
+                if (accuB > 255)  accuB = 255;
+                else  if (accuB < 0) accuB = 0;
+                resultPixels[pixelIndex++] = accuB;
+            }
+        }
+    }
+    return result;
+}
+int main() {
+    corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8);
+    if (!image) {
+        return 1;
+        // error!
+    }
+    int width = image->getWidth();
+    int height = image->getHeight();
+    void* pixels = image->getPixels();
+    float Laplacian9x9Kernal[] = {
+        0, -1, -1, -2, -2, -2, -1, -1, 0,
+        -1, -2, -4, -5, -5, -5, -4, -2, -1,
+        -1, -4, -5, -3, -0, -3, -5, -4, -1,
+        -2, -5, -3, 12, 24, 12, -3, -5, -2,
+        -2, -5, -0, 24, 40, 24, -0, -5, -2,
+        -2, -5, -3, 12, 24, 12, -3, -5, -2,
+        -1, -4, -5, -3, -0, -3, -5, -4, -1,
+        -1, -2, -4, -5, -5, -5, -4, -2, -1,
+        0, -1, -1, -2, -2, -2, -1, -1,  0 };
+    double start;
+    double end;
+    start = omp_get_wtime();
+    corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9);
+    end = omp_get_wtime();
+    std::cout << "Serial processing time = " << end - start << std::endl;
+    //corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage);
+    for (int coreNumber = 1; coreNumber <= 16; coreNumber++) {
+        start = omp_get_wtime();
+        resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber);
+        end = omp_get_wtime();
+        std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl;
+    }
+    //corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage);
+}
+</syntaxhighlight>