Difference between revisions of "Team Lion"
Byungho Kim (talk | contribs) (→Image Processing Performance Using Parallel Programming) |
Byungho Kim (talk | contribs) (→Source Code) |
||
(6 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
{{GPU621/DPS921 Index | 20161}} | {{GPU621/DPS921 Index | 20161}} | ||
= Image Processing Performance Using Parallel Programming = | = Image Processing Performance Using Parallel Programming = | ||
− | This assignment introduces simple image processing using MPI parallel programming. | + | This assignment introduces simple image processing using MPI parallel programming. This assignment also explains about performance comparison from 1 core to 16 cores. |
+ | |||
== Team Lion Member == | == Team Lion Member == | ||
Byungho Kim | Byungho Kim | ||
+ | |||
== Basic Concept of Image Processing == | == Basic Concept of Image Processing == | ||
+ | |||
* Convolution between image and filter(kernel) | * Convolution between image and filter(kernel) | ||
* Each processing of pixel independent from another pixels | * Each processing of pixel independent from another pixels | ||
* Can be used for Edge finding, Blur and image enhancement. | * Can be used for Edge finding, Blur and image enhancement. | ||
− | [[Image:GPU621-LION-1.png| | + | [[Image:GPU621-LION-1.png|640px]] |
+ | |||
+ | |||
+ | == Convolution == | ||
+ | |||
+ | [[Image:GPU621-LION-2.png|640px]] | ||
+ | |||
+ | [[Image:GPU621-LION-3.png|640px]] | ||
+ | |||
+ | |||
+ | == Edge Handling == | ||
+ | |||
+ | All pixels in edges need special treatment. | ||
+ | * Extend | ||
+ | * Tile | ||
+ | * Crop | ||
+ | |||
+ | [[Image:GPU621-LION-4.png|640px|Edge Handling(Extend)]] | ||
+ | |||
+ | |||
+ | == False Sharing Consideration == | ||
+ | *Threading each pixel – The worst | ||
+ | *Threading each row – Good | ||
+ | *Threading multiple rows band – The best | ||
+ | |||
+ | |||
+ | == Test Environment Consideration == | ||
+ | *How to test performance more than 4 cores computer. | ||
+ | |||
+ | |||
+ | == Azure VM Environment == | ||
+ | *Cloud service from Microsoft | ||
+ | *You can rent many kind of Virtual Machines. | ||
+ | *Access remotely using Windows Remote Desktop Connection. | ||
+ | |||
+ | [[Image:GPU621-LION-6-2.png|640px|Edge Handling(Extend)]] | ||
+ | |||
+ | [[Image:GPU621-LION-6.png|640px|Edge Handling(Extend)]] | ||
+ | |||
+ | |||
+ | == Test Result == | ||
+ | [[Image:GPU621-LION-5.png|640px|Edge Handling(Extend)]] | ||
+ | |||
+ | [[Image:GPU621-LION-7.png|640px|Edge Handling(Extend)]] | ||
+ | |||
+ | [[Image:GPU621-LION-8.png|640px|Edge Handling(Extend)]] | ||
+ | |||
+ | |||
+ | == Conclusion == | ||
+ | *Performance depends on number of cores. | ||
+ | *OpenMP is easy to use. Much easier than normal(POSIX) Thread method. | ||
+ | |||
+ | |||
+ | == Source Code == | ||
+ | |||
+ | <syntaxhighlight lang="c" line="1" > | ||
+ | #include "corona.h" | ||
+ | #include <omp.h> | ||
+ | #include <iostream> | ||
+ | #include <cstdlib> | ||
+ | |||
+ | |||
+ | corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) { | ||
+ | |||
+ | int width = image->getWidth(); | ||
+ | int height = image->getHeight(); | ||
+ | unsigned char* pixels = (unsigned char*)(image->getPixels()); | ||
+ | |||
+ | corona::Image* result = corona::CloneImage(image); | ||
+ | unsigned char* resultPixels = (unsigned char*)(result->getPixels()); | ||
+ | |||
+ | |||
+ | for (int py = 0; py < height; py++) { | ||
+ | |||
+ | int pixelIndex = 0; | ||
+ | int targetPixelX = 0; | ||
+ | int targetPixelY = 0; | ||
+ | |||
+ | |||
+ | for (int px = 0; px < width; px++) { | ||
+ | |||
+ | float accuR = 0; | ||
+ | float accuG = 0; | ||
+ | float accuB = 0; | ||
+ | float accuA = 0; | ||
+ | |||
+ | for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) { | ||
+ | for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) { | ||
+ | |||
+ | targetPixelX = px - kernalSizeWidth / 2 + kernalX; | ||
+ | if (targetPixelX < 0) targetPixelX = 0; | ||
+ | else if (targetPixelX >= width) targetPixelX = width - 1; | ||
+ | |||
+ | targetPixelY = py - kernalSizeHeight / 2 + kernalY; | ||
+ | if (targetPixelY < 0) targetPixelY = 0; | ||
+ | else if (targetPixelY >= height) targetPixelY = height - 1; | ||
+ | |||
+ | pixelIndex = (targetPixelY * width + targetPixelX) * 4; | ||
+ | |||
+ | accuR += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX]; | ||
+ | accuG += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX]; | ||
+ | accuB += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX]; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | pixelIndex = (py * width + px) * 4; | ||
+ | |||
+ | if (accuR > 255) accuR = 255; | ||
+ | else if (accuR < 0) accuR = 0; | ||
+ | resultPixels[pixelIndex++] = accuR; | ||
+ | |||
+ | if (accuG > 255) accuG = 255; | ||
+ | else if (accuG < 0) accuG = 0; | ||
+ | resultPixels[pixelIndex++] = accuG; | ||
+ | |||
+ | if (accuB > 255) accuB = 255; | ||
+ | else if (accuB < 0) accuB = 0; | ||
+ | resultPixels[pixelIndex++] = accuB; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | return result; | ||
+ | } | ||
+ | |||
+ | |||
+ | corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) { | ||
+ | int width = image->getWidth(); | ||
+ | int height = image->getHeight(); | ||
+ | unsigned char* pixels = (unsigned char*)(image->getPixels()); | ||
+ | |||
+ | corona::Image* result = corona::CloneImage(image); | ||
+ | unsigned char* resultPixels = (unsigned char*)(result->getPixels()); | ||
+ | |||
+ | omp_set_dynamic(0); | ||
+ | omp_set_num_threads(requestedCore); | ||
+ | |||
+ | #pragma omp parallel | ||
+ | { | ||
+ | int tid = omp_get_thread_num(); | ||
+ | int nt = omp_get_num_threads(); | ||
+ | |||
+ | int jumpRange = height / nt; | ||
+ | |||
+ | for (int py = jumpRange * tid; py < jumpRange * (tid + 1) && py < height; py++) { | ||
+ | |||
+ | int pixelIndex = 0; | ||
+ | int targetPixelX = 0; | ||
+ | int targetPixelY = 0; | ||
+ | |||
+ | |||
+ | for (int px = 0; px < width; px++) { | ||
+ | |||
+ | float accuR = 0; | ||
+ | float accuG = 0; | ||
+ | float accuB = 0; | ||
+ | float accuA = 0; | ||
+ | |||
+ | for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) { | ||
+ | for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) { | ||
+ | |||
+ | targetPixelX = px - kernalSizeWidth / 2 + kernalX; | ||
+ | if (targetPixelX < 0) targetPixelX = 0; | ||
+ | else if (targetPixelX >= width) targetPixelX = width - 1; | ||
+ | |||
+ | targetPixelY = py - kernalSizeHeight / 2 + kernalY; | ||
+ | if (targetPixelY < 0) targetPixelY = 0; | ||
+ | else if (targetPixelY >= height) targetPixelY = height - 1; | ||
+ | |||
+ | pixelIndex = (targetPixelY * width + targetPixelX) * 4; | ||
+ | |||
+ | accuR += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX]; | ||
+ | accuG += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX]; | ||
+ | accuB += pixels[pixelIndex++] * kernal[kernalY * kernalSizeWidth + kernalX]; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | pixelIndex = (py * width + px) * 4; | ||
+ | |||
+ | if (accuR > 255) accuR = 255; | ||
+ | else if (accuR < 0) accuR = 0; | ||
+ | resultPixels[pixelIndex++] = accuR; | ||
+ | |||
+ | if (accuG > 255) accuG = 255; | ||
+ | else if (accuG < 0) accuG = 0; | ||
+ | resultPixels[pixelIndex++] = accuG; | ||
+ | |||
+ | if (accuB > 255) accuB = 255; | ||
+ | else if (accuB < 0) accuB = 0; | ||
+ | resultPixels[pixelIndex++] = accuB; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | |||
+ | return result; | ||
+ | } | ||
+ | |||
+ | |||
+ | int main() { | ||
+ | |||
+ | corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8); | ||
+ | if (!image) { | ||
+ | return 1; | ||
+ | // error! | ||
+ | } | ||
+ | |||
+ | int width = image->getWidth(); | ||
+ | int height = image->getHeight(); | ||
+ | void* pixels = image->getPixels(); | ||
+ | |||
+ | |||
+ | float Laplacian9x9Kernal[] = { | ||
+ | 0, -1, -1, -2, -2, -2, -1, -1, 0, | ||
+ | -1, -2, -4, -5, -5, -5, -4, -2, -1, | ||
+ | -1, -4, -5, -3, -0, -3, -5, -4, -1, | ||
+ | -2, -5, -3, 12, 24, 12, -3, -5, -2, | ||
+ | -2, -5, -0, 24, 40, 24, -0, -5, -2, | ||
+ | -2, -5, -3, 12, 24, 12, -3, -5, -2, | ||
+ | -1, -4, -5, -3, -0, -3, -5, -4, -1, | ||
+ | -1, -2, -4, -5, -5, -5, -4, -2, -1, | ||
+ | 0, -1, -1, -2, -2, -2, -1, -1, 0 }; | ||
+ | |||
+ | |||
+ | double start; | ||
+ | double end; | ||
+ | |||
+ | |||
+ | start = omp_get_wtime(); | ||
+ | corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9); | ||
+ | end = omp_get_wtime(); | ||
+ | std::cout << "Serial processing time = " << end - start << std::endl; | ||
+ | |||
+ | //corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage); | ||
+ | |||
+ | for (int coreNumber = 1; coreNumber <= 16; coreNumber++) { | ||
+ | start = omp_get_wtime(); | ||
+ | resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber); | ||
+ | end = omp_get_wtime(); | ||
+ | std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl; | ||
+ | } | ||
+ | //corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage); | ||
+ | } | ||
+ | |||
+ | </syntaxhighlight> |
Latest revision as of 10:09, 5 April 2016
GPU621/DPS921 | Participants | Groups and Projects | Resources | Glossary
Contents
Image Processing Performance Using Parallel Programming
This assignment introduces simple image processing using OpenMP parallel programming. It also compares the performance obtained when running on 1 to 16 cores.
Team Lion Member
Byungho Kim
Basic Concept of Image Processing
- Convolution between image and filter(kernel)
- Each pixel is processed independently of the other pixels
- Can be used for edge detection, blurring and image enhancement.
Convolution
Edge Handling
Pixels at the edges of the image need special treatment, because the kernel reaches past the image boundary there.
- Extend
- Tile
- Crop
False Sharing Consideration
- Threading each pixel – The worst
- Threading each row – Good
- Threading multiple rows band – The best
Test Environment Consideration
- How can performance be tested on a computer with more than 4 cores?
Azure VM Environment
- Cloud service from Microsoft
- You can rent many kinds of virtual machines.
- Access remotely using Windows Remote Desktop Connection.
Test Result
Conclusion
- Performance depends on the number of cores.
- OpenMP is easy to use, and much easier than using plain (POSIX) threads directly.
Source Code
#include "corona.h"
#include <omp.h>
#include <iostream>
#include <cstdlib>
// Applies a 2D convolution kernel to an image on a single thread.
//
// The pixel buffer is indexed as 4 bytes per pixel; only the first three
// bytes of each pixel (treated as R, G, B) are filtered. The fourth byte is
// never written here, so it keeps whatever corona::CloneImage() put into the
// result. Kernel taps that fall outside the image are clamped to the nearest
// row/column (the "extend" edge handling).
//
// Parameters:
//   image            - source image; only read.
//                      NOTE(review): assumed to hold 4 bytes per pixel
//                      (main() loads PF_R8G8B8A8) - confirm for other callers.
//   kernal           - kernalSizeWidth * kernalSizeHeight weights, row-major.
//   kernalSizeWidth  - number of kernel columns.
//   kernalSizeHeight - number of kernel rows.
//
// Returns the cloned image holding the filtered pixels, or nullptr when
// image or kernal is null or corona::CloneImage() returned null. The caller is
// presumably responsible for releasing the result - verify against the
// Corona API.
corona::Image* calcConvolutitonSerial(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight) {
    if (!image || !kernal) return nullptr;

    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<const unsigned char*>(image->getPixels());

    corona::Image* result = corona::CloneImage(image);
    if (!result) return nullptr;
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    const int bytesPerPixel = 4;   // buffer stride per pixel
    const int colorChannels = 3;   // channels that are actually filtered
    const int halfKernalWidth = kernalSizeWidth / 2;
    const int halfKernalHeight = kernalSizeHeight / 2;

    for (int py = 0; py < height; py++) {
        for (int px = 0; px < width; px++) {

            float accu[colorChannels] = { 0.0f, 0.0f, 0.0f };

            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {
                // Clamp the sampled row once per kernel row; it does not depend on kernalX.
                int targetPixelY = py - halfKernalHeight + kernalY;
                if (targetPixelY < 0) targetPixelY = 0;
                else if (targetPixelY >= height) targetPixelY = height - 1;

                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {
                    int targetPixelX = px - halfKernalWidth + kernalX;
                    if (targetPixelX < 0) targetPixelX = 0;
                    else if (targetPixelX >= width) targetPixelX = width - 1;

                    const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                    // Offsets are computed in size_t so large images cannot overflow int.
                    const std::size_t source =
                        (static_cast<std::size_t>(targetPixelY) * width + targetPixelX) * bytesPerPixel;

                    for (int channel = 0; channel < colorChannels; channel++) {
                        accu[channel] += pixels[source + channel] * weight;
                    }
                }
            }

            const std::size_t target =
                (static_cast<std::size_t>(py) * width + px) * bytesPerPixel;

            // Saturate to the 0..255 byte range before storing.
            for (int channel = 0; channel < colorChannels; channel++) {
                float value = accu[channel];
                if (value > 255) value = 255;
                else if (value < 0) value = 0;
                resultPixels[target + channel] = static_cast<unsigned char>(value);
            }
        }
    }

    return result;
}
// Applies a 2D convolution kernel to an image using OpenMP threads.
//
// Rows are distributed over the threads with a static schedule, so each
// thread works on one contiguous band of rows and every row of the image is
// processed exactly once, including the rows left over when the height is
// not a multiple of the thread count.
//
// The pixel buffer is indexed as 4 bytes per pixel; only the first three
// bytes of each pixel (treated as R, G, B) are filtered. The fourth byte is
// never written here, so it keeps whatever corona::CloneImage() put into the
// result. Kernel taps that fall outside the image are clamped to the nearest
// row/column (the "extend" edge handling).
//
// Parameters:
//   image            - source image; only read.
//                      NOTE(review): assumed to hold 4 bytes per pixel
//                      (main() loads PF_R8G8B8A8) - confirm for other callers.
//   kernal           - kernalSizeWidth * kernalSizeHeight weights, row-major.
//   kernalSizeWidth  - number of kernel columns.
//   kernalSizeHeight - number of kernel rows.
//   requestedCore    - number of OpenMP threads to request; values below 1
//                      are treated as 1.
//
// Side effects: calls omp_set_dynamic(0) and omp_set_num_threads(), which
// change OpenMP settings beyond this call.
//
// Returns the cloned image holding the filtered pixels, or nullptr when
// image or kernal is null or corona::CloneImage() returned null. The caller is
// presumably responsible for releasing the result - verify against the
// Corona API.
corona::Image* calcConvolutitonParallel(corona::Image* image, const float* kernal, const int kernalSizeWidth, const int kernalSizeHeight, int requestedCore) {
    if (!image || !kernal) return nullptr;

    const int width = image->getWidth();
    const int height = image->getHeight();
    const unsigned char* pixels = static_cast<const unsigned char*>(image->getPixels());

    corona::Image* result = corona::CloneImage(image);
    if (!result) return nullptr;
    unsigned char* resultPixels = static_cast<unsigned char*>(result->getPixels());

    const int bytesPerPixel = 4;   // buffer stride per pixel
    const int colorChannels = 3;   // channels that are actually filtered
    const int halfKernalWidth = kernalSizeWidth / 2;
    const int halfKernalHeight = kernalSizeHeight / 2;

    // Disable dynamic adjustment so the requested thread count is the one used.
    omp_set_dynamic(0);
    omp_set_num_threads(requestedCore > 0 ? requestedCore : 1);

    // schedule(static) without a chunk size hands each thread one contiguous
    // band of rows. Unlike a manual height / threads split, no remainder rows
    // are dropped and images with fewer rows than threads are still filtered.
    #pragma omp parallel for schedule(static)
    for (int py = 0; py < height; py++) {
        for (int px = 0; px < width; px++) {

            float accu[colorChannels] = { 0.0f, 0.0f, 0.0f };

            for (int kernalY = 0; kernalY < kernalSizeHeight; kernalY++) {
                // Clamp the sampled row once per kernel row; it does not depend on kernalX.
                int targetPixelY = py - halfKernalHeight + kernalY;
                if (targetPixelY < 0) targetPixelY = 0;
                else if (targetPixelY >= height) targetPixelY = height - 1;

                for (int kernalX = 0; kernalX < kernalSizeWidth; kernalX++) {
                    int targetPixelX = px - halfKernalWidth + kernalX;
                    if (targetPixelX < 0) targetPixelX = 0;
                    else if (targetPixelX >= width) targetPixelX = width - 1;

                    const float weight = kernal[kernalY * kernalSizeWidth + kernalX];

                    // Offsets are computed in size_t so large images cannot overflow int.
                    const std::size_t source =
                        (static_cast<std::size_t>(targetPixelY) * width + targetPixelX) * bytesPerPixel;

                    for (int channel = 0; channel < colorChannels; channel++) {
                        accu[channel] += pixels[source + channel] * weight;
                    }
                }
            }

            const std::size_t target =
                (static_cast<std::size_t>(py) * width + px) * bytesPerPixel;

            // Saturate to the 0..255 byte range before storing. Each thread
            // writes only the rows it owns, so no synchronization is needed.
            for (int channel = 0; channel < colorChannels; channel++) {
                float value = accu[channel];
                if (value > 255) value = 255;
                else if (value < 0) value = 0;
                resultPixels[target + channel] = static_cast<unsigned char>(value);
            }
        }
    }

    return result;
}
int main() {
corona::Image* image = corona::OpenImage("sample.jpg", corona::PF_R8G8B8A8);
if (!image) {
return 1;
// error!
}
int width = image->getWidth();
int height = image->getHeight();
void* pixels = image->getPixels();
float Laplacian9x9Kernal[] = {
0, -1, -1, -2, -2, -2, -1, -1, 0,
-1, -2, -4, -5, -5, -5, -4, -2, -1,
-1, -4, -5, -3, -0, -3, -5, -4, -1,
-2, -5, -3, 12, 24, 12, -3, -5, -2,
-2, -5, -0, 24, 40, 24, -0, -5, -2,
-2, -5, -3, 12, 24, 12, -3, -5, -2,
-1, -4, -5, -3, -0, -3, -5, -4, -1,
-1, -2, -4, -5, -5, -5, -4, -2, -1,
0, -1, -1, -2, -2, -2, -1, -1, 0 };
double start;
double end;
start = omp_get_wtime();
corona::Image* resultImage = calcConvolutitonSerial(image, Laplacian9x9Kernal, 9, 9);
end = omp_get_wtime();
std::cout << "Serial processing time = " << end - start << std::endl;
//corona::SaveImage("resultSerial.png", corona::FF_AUTODETECT, resultImage);
for (int coreNumber = 1; coreNumber <= 16; coreNumber++) {
start = omp_get_wtime();
resultImage = calcConvolutitonParallel(image, Laplacian9x9Kernal, 9, 9, coreNumber);
end = omp_get_wtime();
std::cout << "Parllel processing time (" << coreNumber << " core(s)) = " << end - start << std::endl;
}
//corona::SaveImage("resultParallel.png", corona::FF_AUTODETECT, resultImage);
}