Difference between revisions of "Top Solo"
(→Overview) |
(→Issues Encountered) |
||
(17 intermediate revisions by the same user not shown) | |||
Line 18: | Line 18: | ||
I am going to try and profile and optimize the following function from the CImg Library. | I am going to try and profile and optimize the following function from the CImg Library. | ||
This function creates triangles on the screen and allows the user to make them clump up by clicking the mouse buttons. | This function creates triangles on the screen and allows the user to make them clump up by clicking the mouse buttons. | ||
+ | |||
+ | This function, like the others in CImg_demo.cpp, uses the <CImg.h> library. This is a massive library that contains functions and routines responsible for displaying and preparing the environment for Image and Video processing. My function 'Filled Triangles' uses this library and its resources extensively. | ||
Compilation with profiling on Mac OSX: | Compilation with profiling on Mac OSX: | ||
− | |||
− | |||
'''g++ -o CImg_demo CImg_demo.cpp -O2 -g -pg -I.. -Wall -W -ansi -pedantic -Dcimg_use_vt100 -I/usr/X11R6/include -lm -L/usr/X11R6/lib -lpthread -lX11''' | '''g++ -o CImg_demo CImg_demo.cpp -O2 -g -pg -I.. -Wall -W -ansi -pedantic -Dcimg_use_vt100 -I/usr/X11R6/include -lm -L/usr/X11R6/lib -lpthread -lX11''' | ||
Line 27: | Line 27: | ||
+ | === The Program - Filled Triangles === | ||
− | < | + | <source lang="cpp"> |
// Item : Filled Triangles | // Item : Filled Triangles | ||
Line 60: | Line 61: | ||
// Init images and create display window. | // Init images and create display window. | ||
− | |||
CImg<unsigned char> img0(background), img; | CImg<unsigned char> img0(background), img; | ||
Line 66: | Line 66: | ||
CImgDisplay disp(img0,"[#6] - Filled Triangles (Click to shrink)"); | CImgDisplay disp(img0,"[#6] - Filled Triangles (Click to shrink)"); | ||
− | |||
− | |||
// Define random properties (pos, size, colors, ..) for all triangles that will be displayed. | // Define random properties (pos, size, colors, ..) for all triangles that will be displayed. | ||
− | |||
float posx[SIZE], posy[SIZE], rayon[SIZE], angle[SIZE], veloc[SIZE], opacity[SIZE]; | float posx[SIZE], posy[SIZE], rayon[SIZE], angle[SIZE], veloc[SIZE], opacity[SIZE]; | ||
Line 77: | Line 74: | ||
std::srand((unsigned int)time(0)); | std::srand((unsigned int)time(0)); | ||
− | |||
// I'm thinking of offloading these operations to the GPU | // I'm thinking of offloading these operations to the GPU | ||
− | |||
for (int k = 0; k<SIZE; ++k) { | for (int k = 0; k<SIZE; ++k) { | ||
Line 100: | Line 95: | ||
opacity[k] = (float)(0.3 + 1.5*cimg::rand()); | opacity[k] = (float)(0.3 + 1.5*cimg::rand()); | ||
− | |||
} | } | ||
− | |||
− | |||
// Start animation loop. | // Start animation loop. | ||
while (!disp.is_closed() && !disp.is_keyQ() && !disp.is_keyESC()) { | while (!disp.is_closed() && !disp.is_keyQ() && !disp.is_keyESC()) { | ||
− | |||
img = img0; | img = img0; | ||
− | |||
− | |||
/* Maybe offload this for loop to GPU? */ | /* Maybe offload this for loop to GPU? */ | ||
Line 162: | Line 151: | ||
} | } | ||
− | |||
− | |||
// Display current animation framerate, and refresh display window. | // Display current animation framerate, and refresh display window. | ||
Line 172: | Line 159: | ||
if (++num>SIZE) num = SIZE; | if (++num>SIZE) num = SIZE; | ||
− | |||
− | |||
// Allow the user to toggle fullscreen mode, by pressing CTRL+F. | // Allow the user to toggle fullscreen mode, by pressing CTRL+F. | ||
Line 185: | Line 170: | ||
} | } | ||
− | </ | + | |
+ | </source> | ||
=== Initial Profile (on OpenSUSE Linux at Seneca) === | === Initial Profile (on OpenSUSE Linux at Seneca) === | ||
− | + | <pre> | |
Flat profile: | Flat profile: | ||
− | |||
− | |||
− | |||
Each sample counts as 0.01 seconds. | Each sample counts as 0.01 seconds. | ||
Line 283: | Line 266: | ||
0.00 3.10 0.00 1 0.00 136.13 cimg_library::CImgList<float>::get_crop_font() const | 0.00 3.10 0.00 1 0.00 136.13 cimg_library::CImgList<float>::get_crop_font() const | ||
− | </pre | + | </pre> |
=== Modified Makefile for Profiling === | === Modified Makefile for Profiling === | ||
− | < | + | <source lang="bash"> |
− | |||
CIMG_FILES = CImg_demo | CIMG_FILES = CImg_demo | ||
− | |||
− | |||
# Files which requires external libraries to run. | # Files which requires external libraries to run. | ||
Line 300: | Line 280: | ||
gmic | gmic | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
CIMG_VERSION = 1.5.4 | CIMG_VERSION = 1.5.4 | ||
Line 349: | Line 320: | ||
endif | endif | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
# Flags to enable strict code standards | # Flags to enable strict code standards | ||
Line 373: | Line 332: | ||
endif | endif | ||
− | |||
− | |||
− | |||
− | |||
# Flags to enable code debugging. | # Flags to enable code debugging. | ||
CIMG_DEBUG_CFLAGS = -Dcimg_verbosity=3 -Dcimg_strict_warnings -g | CIMG_DEBUG_CFLAGS = -Dcimg_verbosity=3 -Dcimg_strict_warnings -g | ||
− | |||
− | |||
Line 389: | Line 342: | ||
CIMG_VT100_CFLAGS = -Dcimg_use_vt100 | CIMG_VT100_CFLAGS = -Dcimg_use_vt100 | ||
− | |||
− | |||
Line 404: | Line 355: | ||
endif | endif | ||
− | |||
− | |||
Line 419: | Line 368: | ||
endif | endif | ||
− | |||
− | |||
Line 428: | Line 375: | ||
CIMG_OPENCV_LDFLAGS = -lcv -lhighgui | CIMG_OPENCV_LDFLAGS = -lcv -lhighgui | ||
− | |||
− | |||
Line 435: | Line 380: | ||
CIMG_NODISPLAY_CFLAGS = -Dcimg_display=0 | CIMG_NODISPLAY_CFLAGS = -Dcimg_display=0 | ||
− | |||
− | |||
Line 448: | Line 391: | ||
CIMG_X11_LDFLAGS = -L$(X11PATH)/lib -lpthread -lX11 | CIMG_X11_LDFLAGS = -L$(X11PATH)/lib -lpthread -lX11 | ||
− | |||
− | |||
Line 459: | Line 400: | ||
CIMG_XSHM_LDFLAGS = # -lXext | CIMG_XSHM_LDFLAGS = # -lXext | ||
− | |||
− | |||
− | |||
# Flags to enable GDI32 display (Windows native). | # Flags to enable GDI32 display (Windows native). | ||
Line 468: | Line 406: | ||
CIMG_GDI32_LDFLAGS = -lgdi32 | CIMG_GDI32_LDFLAGS = -lgdi32 | ||
− | |||
− | |||
Line 481: | Line 417: | ||
CIMG_XRANDR_LDFLAGS = -lXrandr | CIMG_XRANDR_LDFLAGS = -lXrandr | ||
− | |||
− | |||
Line 492: | Line 426: | ||
CIMG_PNG_LDFLAGS = -lpng -lz | CIMG_PNG_LDFLAGS = -lpng -lz | ||
− | |||
− | |||
Line 503: | Line 435: | ||
CIMG_JPEG_LDFLAGS = -ljpeg | CIMG_JPEG_LDFLAGS = -ljpeg | ||
− | |||
− | |||
Line 514: | Line 444: | ||
CIMG_TIFF_LDFLAGS = -ltiff | CIMG_TIFF_LDFLAGS = -ltiff | ||
− | |||
− | |||
Line 525: | Line 453: | ||
CIMG_MINC2_LDFLAGS = -lminc_io -lvolume_io2 -lminc2 -lnetcdf -lhdf5 -lz -L${HOME}/local/lib | CIMG_MINC2_LDFLAGS = -lminc_io -lvolume_io2 -lminc2 -lnetcdf -lhdf5 -lz -L${HOME}/local/lib | ||
− | |||
− | |||
Line 536: | Line 462: | ||
CIMG_EXR_LDFLAGS = -lIlmImf -lHalf | CIMG_EXR_LDFLAGS = -lIlmImf -lHalf | ||
− | |||
− | |||
Line 547: | Line 471: | ||
CIMG_FFMPEG_LDFLAGS = -lavcodec -lavformat -lswscale | CIMG_FFMPEG_LDFLAGS = -lavcodec -lavformat -lswscale | ||
− | |||
− | |||
Line 558: | Line 480: | ||
CIMG_ZLIB_LDFLAGS = -lz | CIMG_ZLIB_LDFLAGS = -lz | ||
− | |||
− | |||
Line 569: | Line 489: | ||
CIMG_MAGICK_LDFLAGS = `Magick++-config --ldflags` `Magick++-config --libs` | CIMG_MAGICK_LDFLAGS = `Magick++-config --ldflags` `Magick++-config --libs` | ||
− | |||
− | |||
Line 588: | Line 506: | ||
endif | endif | ||
− | |||
− | |||
Line 599: | Line 515: | ||
CIMG_LAPACK_LDFLAGS = -lblas -lg2c -llapack | CIMG_LAPACK_LDFLAGS = -lblas -lg2c -llapack | ||
− | |||
− | |||
Line 610: | Line 524: | ||
CIMG_BOARD_LDFLAGS = -lboard | CIMG_BOARD_LDFLAGS = -lboard | ||
− | |||
− | |||
Line 617: | Line 529: | ||
CIMG_SOLARIS_LDFLAGS = -R$(X11PATH)/lib -lrt -lnsl -lsocket | CIMG_SOLARIS_LDFLAGS = -R$(X11PATH)/lib -lrt -lnsl -lsocket | ||
− | |||
− | |||
Line 628: | Line 538: | ||
endif | endif | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
.cpp: | .cpp: | ||
Line 661: | Line 562: | ||
endif | endif | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
clean: | clean: | ||
Line 743: | Line 572: | ||
endif | endif | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
# Linux/BSD/Mac OSX targets, with X11 display. | # Linux/BSD/Mac OSX targets, with X11 display. | ||
Line 868: | Line 597: | ||
all | all | ||
+ | # MacOsX targets, with X11 display. | ||
+ | macosx: | ||
+ | @$(MAKE) \ | ||
− | + | "CONF_CFLAGS = \ | |
− | + | $(PROFILE) \ | |
− | + | $(PROFILE2) \ | |
$(CIMG_ANSI_CFLAGS) \ | $(CIMG_ANSI_CFLAGS) \ | ||
− | |||
− | |||
$(CIMG_VT100_CFLAGS) \ | $(CIMG_VT100_CFLAGS) \ | ||
− | $(CIMG_X11_CFLAGS | + | $(CIMG_X11_CFLAGS)" \ |
− | |||
− | |||
"CONF_LDFLAGS = \ | "CONF_LDFLAGS = \ | ||
− | $(CIMG_X11_LDFLAGS | + | $(CIMG_X11_LDFLAGS)" \ |
− | |||
− | |||
all | all | ||
+ | # Windows targets, with GDI32 display. | ||
− | + | windows: | |
− | |||
− | |||
@$(MAKE) \ | @$(MAKE) \ | ||
− | "CONF_CFLAGS = | + | "CONF_CFLAGS = " \ |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
"CONF_LDFLAGS = \ | "CONF_LDFLAGS = \ | ||
− | $( | + | $(CIMG_GDI32_LDFLAGS)" \ |
− | |||
− | |||
− | |||
− | |||
− | |||
all | all | ||
+ | </source> | ||
+ | === Makefile changes === | ||
+ | I have made changes to the original Makefile that was provided with this library. I have customized it to compile for profiling by default. A new build can be generated by first cleaning the old build (e.g. make clean). The source can be compiled by running make <platform>, e.g. make macosx or make linux. You will need to download a third-party program to run the 'make' command on Windows, but the execution is the same. The program can also be built on Windows using Visual Studio or on the command line (cl.exe must be in the PATH environment variable). | ||
− | + | === Build Instructions === | |
− | + | This Makefile and application is cross-platform. | |
− | + | make <platform> eg: macosx, linux etc. | |
− | + | I have successfully built this application on the following platforms: Windows 7, OpenSuse linux and Mac OSX Lion. | |
− | |||
− | + | === Summary of Findings === | |
− | + | The execution of the program takes roughly 3.10 to 20 seconds (depending on how long the triangle animation calculations are measured). It should be noted that this application originally relied upon user input for execution and for termination. I have modified this initial behavior by ensuring the while loop (which generates the triangles) executes for a maximum of 1000 iterations. The time measured in this assignment is for every 1000 iterations of this loop. | |
− | |||
+ | === Profiling Results === | ||
+ | The results of the initial profile show that most of the execution time is consumed by drawing the triangles out to the screen one at a time. It seems like this can be optimized by offloading this drawing to n threads based on n triangles to be drawn. But this is subject to change because of any additional complexity that may be introduced, such as interoperability between the GPU and the CPU. | ||
+ | There is another for loop which sets the dimensions for each triangle one by one in linear time, O(n). This process can also be out-sourced to the GPU in n threads for n triangles. I would need to determine if this process also involves interoperability between the CPU and GPU. | ||
+ | The complexity of the entire program is O(n^3). There is a for loop for setup, a while loop for accepting user input and another for loop for drawing the triangles. | ||
− | + | Also, the recorded times will increase if the maximum number of loop iterations is increased (e.g. 10000, 100000, 1000000). This will show the same relationship but with a higher task time. | |
− | |||
− | + | === Potential Parallelization Candidates === | |
− | + | Upon analyzing this function I discovered two possible areas where I could optimize the code using threads sent to the GPU. The first is a for loop which sets the attributes for 100 triangles in serial. This task can be done in parallel using 100 threads on the GPU. | |
+ | <source lang="cpp"> | ||
− | + | for (int k = 0; k<100; ++k) { | |
− | + | posx[k] = (float)(cimg::rand()*img0.width()); | |
− | + | posy[k] = (float)(cimg::rand()*img0.height()); | |
− | + | rayon[k] = (float)(10 + cimg::rand()*50); | |
− | + | angle[k] = (float)(cimg::rand()*360); | |
− | + | veloc[k] = (float)(cimg::rand()*20 - 10); | |
− | + | color[k][0] = (unsigned char)(cimg::rand()*255); | |
− | + | color[k][1] = (unsigned char)(cimg::rand()*255); | |
− | + | color[k][2] = (unsigned char)(cimg::rand()*255); | |
− | + | opacity[k] = (float)(0.3 + 1.5*cimg::rand()); | |
− | + | } | |
− | + | </source> | |
− | |||
− | |||
− | + | The second instance where this is possible is a bit tricky. It involves another serial for loop. The purpose of this loop is to draw each of the triangles on the screen and manipulate them later on. I am not 100 percent sure this can be done in parallel in practice but in theory it should be possible because the application is drawing out each triangle one by one. | |
− | + | <source lang="cpp"> | |
+ | // Draw each triangle on the background image. | ||
− | + | for (int k = 0; k<num; ++k) { | |
− | + | const int | |
− | + | x0 = (int)(posx[k] + rayon[k]*std::cos(angle[k]*cimg::PI/180)), | |
− | + | y0 = (int)(posy[k] + rayon[k]*std::sin(angle[k]*cimg::PI/180)), | |
− | + | x1 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 120)*cimg::PI/180)), | |
− | + | y1 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 120)*cimg::PI/180)), | |
− | + | x2 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 240)*cimg::PI/180)), | |
− | + | y2 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 240)*cimg::PI/180)); | |
− | + | if (k%10) img.draw_triangle(x0,y0,x1,y1,x2,y2,color[k],opacity[k]); | |
− | + | else img.draw_triangle(x0,y0,x1,y1,x2,y2,img0,0,0,img0.width()-1,0,0,img.height()-1,opacity[k]); | |
− | + | img.draw_triangle(x0,y0,x1,y1,x2,y2,white,opacity[k],~0U); | |
− | + | ||
+ | // Make the triangles rotate, and check for mouse click event. | ||
+ | // (to make triangles collapse or join). | ||
+ | angle[k]+=veloc[k]; | ||
− | + | if (disp.mouse_x()>0 && disp.mouse_y()>0) { | |
− | + | float u = disp.mouse_x() - posx[k], v = disp.mouse_y() - posy[k]; | |
− | + | if (disp.button()) { u = -u; v = -v; } | |
− | + | posx[k]-=0.03f*u, posy[k]-=0.03f*v; | |
− | + | if (posx[k]<0 || posx[k]>=img.width()) posx[k] = (float)(cimg::rand()*img.width()); | |
− | + | if (posy[k]<0 || posy[k]>=img.height()) posy[k] = (float)(cimg::rand()*img.height()); | |
− | + | } | |
− | + | } | |
− | + | ||
− | + | </source> | |
− | + | === Amdahls Law Calculations === | |
− | + | Since there are 100 triangles generated, we can theoretically create 100 threads, one for each triangle. The draw_line, draw_triangle, and draw_image functions take up roughly 16 percent ((0.38 + 0.08 + 0.06) / 3.10) of the execution time of the application in seconds. Plugging that into Amdahl's law, S_n = 1 / ((1 - P) + P / n), with P = 0.16 and n = 100 cores we get: | |
− | + | S100 = 1 / ((1 - 0.16) + 0.16 / 100) | |
+ | = 1.18 or 1.2 speedup is theoretically achievable rounded up PER 1000 iterations of the while loop to draw these triangles. | ||
+ | Will I work on this Project? | ||
+ | If I can optimize this function or any other function within the CImg library I will continue with this project. If it is not possible to optimize this project within the given time of the course then it will be difficult to continue on with this project and I will have to work with someone else's project. But my initial plan is to continue with this project unless I am told otherwise. | ||
+ | === Issues Encountered === | ||
− | + | '''Gprof on MacOSx with Intel processor''' | |
− | |||
− | + | The profiling tool gprof does not work on MacBooks with an Intel processor installed (I have an Intel Core i5). This was verified by numerous internet resources and annoying personal experience. | |
− | |||
− | + | '''Makefile builds EVERYTHING''' | |
− | + | The Makefile shipped with this open source library is programmed to build all of the modules located in CImg_demo.cpp. I had to modify the Makefile to only build the specified function within the library of Image processing functions | |
− | + | == Assignment 2 == | |
− | + | This assignment involved GPU programming the functions I had speculated I could process concurrently. After further analysis I determined that the second for loop cannot be processed in parallel without converting some underlying API functions to device functions. These functions are used in multiple areas in the framework so I did not pursue changing the framework too much. The first for loop however contained independent data that could be executed in parallel so I went ahead and created kernels for that code. I also noticed that the function made a call to cimg::rand(). This is obviously a function that returns random numbers. Luckily CUDA also has multiple libraries that perform these functions. Off the top of my head: Thrust and CURAND do. I went with CURAND because it has the word "rand" in it and that's what I need. hehe | |
+ | === Kernels === | ||
+ | <source lang="cpp"> | ||
− | |||
− | + | /* | |
− | + | * Setup and initialize curand with a seed | |
− | + | */ | |
− | + | __global__ void initCurand(curandState* state){ | |
+ | int idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
+ | curand_init(100, idx, 0, &state[idx]); | ||
+ | __syncthreads(); | ||
− | + | } | |
− | |||
− | |||
− | + | /* | |
− | + | * CUDA kernel that will execute 100 threads in parallel | |
− | + | * and will populate these parallel arrays with 100 random numbers | |
− | + | * array size = 100. | |
− | + | */ | |
− | |||
− | |||
− | + | __global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc, | |
− | + | float* opacity ,float* angle, unsigned char* color, int height, | |
− | + | int width, curandState* state, size_t pitch){ | |
− | |||
+ | int idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
+ | curandState localState = state[idx]; | ||
− | |||
− | |||
− | + | posx[idx] = (float)(curand_normal(&localState)*width); | |
− | + | posy[idx] = (float)(curand_normal(&localState)*height); | |
− | + | rayon[idx] = (float)(10 + curand_normal(&localState)*50); | |
− | + | angle[idx] = (float)(curand_normal(&localState)*360); | |
− | + | veloc[idx] = (float)(curand_uniform(&localState)*20 - 10); | |
− | + | color[idx*pitch] = (unsigned char)(curand_normal(&localState)*255); | |
+ | color[(idx*pitch)+1] = (unsigned char)(curand_normal(&localState)*255); | ||
+ | color[(idx*pitch)+2] = (unsigned char)(curand_normal(&localState)*255); | ||
+ | opacity[idx] = (float)(0.3f + 1.5f *curand_normal(&localState)); | ||
− | + | ||
− | + | __syncthreads(); | |
− | + | } | |
− | + | // check for any errors returned by CUDA API functions. | |
− | + | void errCheck(cudaError_t err, const char* msg){ | |
− | + | if (err != cudaSuccess) | |
− | + | std::cout<< msg << ": " << cudaGetErrorString(err) << std::endl; | |
− | + | } | |
− | + | </source> | |
− | + | === Body Code === | |
+ | <source lang="cpp"> | ||
− | |||
− | + | // check for any errors returned by CUDA API functions. | |
− | + | void errCheck(cudaError_t err, const char* msg){ | |
− | + | if (err != cudaSuccess) | |
− | + | std::cout<< msg << ": " << cudaGetErrorString(err) << std::endl; | |
− | + | } | |
− | |||
− | |||
− | + | /*--------------------------- | |
− | |||
− | |||
− | + | Main procedure | |
− | |||
− | |||
− | + | --------------------------*/ | |
− | + | int main() { | |
− | |||
− | |||
− | + | // Create a colored 640x480 background image which consists of different color shades. | |
− | + | CImg<float> background(640,480,1,3); | |
− | + | cimg_forXY(background,x,y) background.fillC(x,y,0, | |
− | + | x*std::cos(6.0*y/background.height()) + y*std::sin(9.0*x/background.width()), | |
− | + | x*std::sin(8.0*y/background.height()) - y*std::cos(11.0*x/background.width()), | |
+ | x*std::cos(13.0*y/background.height()) - y*std::sin(8.0*x/background.width())); | ||
+ | background.normalize(0,180); | ||
+ | |||
− | + | // Init images and create display window. | |
− | + | CImg<unsigned char> img0(background), img; | |
− | + | unsigned char white[] = { 255, 255, 255 }, color[100][3]; | |
− | " | + | CImgDisplay disp(img0,"[#6] - Filled Triangles (Click to shrink)"); |
− | + | // error handling | |
− | + | cudaError_t err; | |
− | + | // Define random properties (pos, size, colors, ..) for all triangles that will be displayed. | |
− | + | float posx[100]; | |
− | + | float posy[100]; | |
− | + | float rayon[100]; | |
− | + | float angle[100]; | |
− | + | float veloc[100]; | |
+ | float opacity[100]; | ||
+ | // Define the same properties but for the device | ||
+ | float* d_posx; | ||
− | + | float* d_posy; | |
− | + | float* d_rayon; | |
− | + | float* d_angle; | |
− | + | float* d_veloc; | |
− | + | float* d_opacity; | |
− | + | unsigned char* d_color; | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | // CURAND state | |
− | + | curandState* devState; | |
− | |||
− | |||
− | + | // allocate memory on the device for the device arrays | |
− | + | err = cudaMalloc((void**)&d_posx, 100 * sizeof(float)); | |
− | + | errCheck(err, "cudaMalloc((void**)&d_posx, 100 * sizeof(float))"); | |
− | + | err = cudaMalloc((void**)&d_posy, 100 * sizeof(float)); | |
+ | errCheck(err,"cudaMalloc((void**)&d_posy, 100 * sizeof(float))"); | ||
+ | err = cudaMalloc((void**)&d_rayon, 100 * sizeof(float)); | ||
+ | errCheck(err,"cudaMalloc((void**)&d_rayon, 100 * sizeof(float))"); | ||
− | + | err = cudaMalloc((void**)&d_angle, 100 * sizeof(float)); | |
− | + | errCheck(err,"cudaMalloc((void**)&d_angle, 100 * sizeof(float))"); | |
− | + | err = cudaMalloc((void**)&d_veloc, 100 * sizeof(float)); | |
− | + | errCheck(err,"cudaMalloc((void**)&d_veloc, 100 * sizeof(float))"); | |
− | + | err = cudaMalloc((void**)&d_opacity, 100 * sizeof(float)); | |
− | + | errCheck(err,"cudaMalloc((void**)&d_opacity, 100 * sizeof(float))"); | |
− | + | err = cudaMalloc((void**)&devState, 100*sizeof(curandState)); | |
+ | errCheck(err,"cudaMalloc((void**)&devState, 100*sizeof(curandState))"); | ||
+ | size_t pitch; | ||
+ | //allocated the device memory for source array | ||
− | + | err = cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100); | |
− | + | errCheck(err,"cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100)"); | |
− | + | // launch grid of threads | |
− | + | dim3 dimBlock(100); | |
− | + | dim3 dimGrid(1); | |
− | + | ||
− | + | /* Kernel for initializing CURAND */ | |
− | + | initCurand<<<1,100>>>(devState); | |
− | |||
− | |||
− | + | // synchronize the device and the host | |
− | + | cudaDeviceSynchronize(); | |
− | + | ||
− | + | /*Kernel for initializing Arrays */ | |
− | + | initializeArrays<<<1, 100>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle, | |
− | + | d_color, img0.height(), img0.width(), devState, pitch); | |
− | + | // synchronize the device and the host | |
− | + | cudaDeviceSynchronize(); | |
− | + | ||
− | + | // get the populated arrays back to the host for use | |
− | + | err = cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost); | |
− | + | errCheck(err,"cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost)"); | |
− | + | err = cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost); | |
− | + | errCheck(err,"cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost)"); | |
− | + | err = cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost); | |
− | + | errCheck(err,"cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost)"); | |
− | + | err = cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost); | |
+ | errCheck(err,"cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost)"); | ||
+ | err = cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost); | ||
+ | errCheck(err,"cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost)"); | ||
− | + | err = cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost); | |
− | + | errCheck(err,"cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost)"); | |
− | + | // pitch of color array is 3+1 padded | |
− | + | err = cudaMemcpy2D(color,4,d_color,pitch,3 *sizeof(unsigned char),3, cudaMemcpyDeviceToHost); | |
− | " | + | errCheck(err,"cudaMemcpy2D(color,pitch,d_color,100*3,3 *sizeof(unsigned char),100* sizeof(unsigned char), cudaMemcpyDeviceToHost)"); |
− | + | // measuring time it takes for triangle animations in 1000 iterations | |
− | + | int i = 0, num = 1; | |
+ | |||
+ | // Start animation loop. | ||
+ | while (!disp.is_closed() && !disp.is_keyQ() && !disp.is_keyESC() && i < 1000) { | ||
− | + | img = img0; | |
− | + | ||
− | + | i++; | |
− | + | // Draw each triangle on the background image. | |
− | + | for (int k = 0; k<num; ++k) { | |
− | + | const int | |
− | + | x0 = (int)(posx[k] + rayon[k]*std::cos(angle[k]*cimg::PI/180)), | |
+ | y0 = (int)(posy[k] + rayon[k]*std::sin(angle[k]*cimg::PI/180)), | ||
+ | x1 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 120)*cimg::PI/180)), | ||
+ | y1 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 120)*cimg::PI/180)), | ||
− | + | x2 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 240)*cimg::PI/180)), | |
− | + | y2 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 240)*cimg::PI/180)); | |
− | + | if (k%10) img.draw_triangle(x0,y0,x1,y1,x2,y2,color[k],opacity[k]); | |
− | + | else img.draw_triangle(x0,y0,x1,y1,x2,y2,img0,0,0,img0.width()-1,0,0,img.height()-1,opacity[k]); | |
− | + | img.draw_triangle(x0,y0,x1,y1,x2,y2,white,opacity[k],~0U); | |
− | + | ||
− | + | // Make the triangles rotate, and check for mouse click event. | |
− | + | // (to make triangles collapse or join). | |
+ | angle[k]+=veloc[k]; | ||
+ | if (disp.mouse_x()>0 && disp.mouse_y()>0) { | ||
+ | float u = disp.mouse_x() - posx[k], v = disp.mouse_y() - posy[k]; | ||
− | + | if (disp.button()) { u = -u; v = -v; } | |
− | + | posx[k]-=0.03f*u, posy[k]-=0.03f*v; | |
− | + | if (posx[k]<0 || posx[k]>=img.width()) posx[k] = (float)(cimg::rand()*img.width()); | |
− | + | if (posy[k]<0 || posy[k]>=img.height()) posy[k] = (float)(cimg::rand()*img.height()); | |
− | + | } | |
− | + | } | |
− | + | ||
+ | // Display current animation framerate, and refresh display window. | ||
+ | img.draw_text(5,5,"%u frames/s",white,0,0.5f,13,(unsigned int)disp.frames_per_second()); | ||
+ | img0.resize(disp.display(img).resize(false).wait(20)); | ||
− | + | if (++num>100) num = 100; | |
− | + | ||
− | + | // Allow the user to toggle fullscreen mode, by pressing CTRL+F. | |
− | + | if (disp.is_keyCTRLLEFT() && disp.is_keyF()) disp.resize(640,480,false).toggle_fullscreen(false); | |
− | + | } | |
− | |||
− | |||
− | + | // free allocated device memory | |
− | + | cudaFree(d_posy); | |
− | + | cudaFree(d_posx); | |
− | + | cudaFree(d_rayon); | |
− | + | cudaFree(d_veloc); | |
− | + | cudaFree(d_opacity); | |
− | + | cudaFree(d_color); | |
− | + | cudaFree(d_angle); | |
− | + | cudaFree(devState); | |
− | + | return 0; | |
− | + | } | |
− | |||
− | + | </source> | |
+ | === Profiling Results === | ||
+ | The entirety of the GPU code executes in 0.15 - 0.16 seconds. I profiled the application using nvvp and NSight on Visual Studio 2010. | ||
+ | === Issues Encountered === | ||
+ | Makefile does not work on the Windows platform. Tried adding the -lcurand linker flag but to no avail. | ||
+ | The code does compile cleanly without error on Visual Studio 2010 IDE. | ||
− | + | Coding issues: | |
− | + | Stackoverflow Questions I made: | |
− | + | [http://stackoverflow.com/questions/15245723/using-arrayij-in-a-cuda-kernel-memcpy-call Using 2D Arrays] | |
+ | [http://stackoverflow.com/questions/15238009/concurrently-initializing-many-arrays-with-random-numbers-using-curand-and-cuda Memory Issues] | ||
− | |||
− | |||
− | |||
− | + | With the help of the open-source community I was able to solve my problems and gain a better understanding of CUDA and CURAND. | |
== Assignment 3 == | == Assignment 3 == |
Latest revision as of 02:31, 8 March 2013
Top Solo
Contents
- 1 Overview
- 2 Assignment 1
- 2.1 The Program - Filled Triangles
- 2.2 Initial Profile (on OpenSUSE Linux at Seneca)
- 2.3 Modified Makefile for Profiling
- 2.4 Makefile changes
- 2.5 Build Instructions
- 2.6 Summary of Findings
- 2.7 Profiling Results
- 2.8 Potential Parallelization Candidates
- 2.9 Amdahls Law Calculations
- 2.10 Issues Encountered
- 3 Assignment 2
- 4 Assignment 3
Overview
team name - classical DoTA reference
CImg is a free and open source cross-platform Image Processing solution written in C++:
Links Supporting Project:
Assignment 1
I am going to try and profile and optimize the following function from the CImg Library. This function creates triangles on the screen and allows the user to make them clump up by clicking the mouse buttons.
This function, like the others in CImg_demo.cpp, uses the <CImg.h> library. This is a massive library that contains functions and routines responsible for displaying and preparing the environment for Image and Video processing. My function 'Filled Triangles' uses this library and its resources extensively.
Compilation with profiling on Mac OSX:
g++ -o CImg_demo CImg_demo.cpp -O2 -g -pg -I.. -Wall -W -ansi -pedantic -Dcimg_use_vt100 -I/usr/X11R6/include -lm -L/usr/X11R6/lib -lpthread -lX11
The Program - Filled Triangles
// Item : Filled Triangles
//-------------------------
// Animates up to SIZE rotating, semi-transparent triangles over a generated
// 640x480 background until the window is closed or Q / ESC is pressed.
// One more triangle is added per frame until SIZE of them are on screen.
// While the display reports a positive pointer position the triangles drift
// away from the pointer; while a mouse button is held they are pulled
// towards it instead. Always returns 0.
void* item_filled_triangles() {
  // Number of triangles; every per-triangle array below is sized from it.
  const int SIZE = 100;

  // Create a colored 640x480 background image which consists of different color shades.
  CImg<float> background(640,480,1,3);
  cimg_forXY(background,x,y) background.fillC(x,y,0,
      x*std::cos(6.0*y/background.height()) + y*std::sin(9.0*x/background.width()),
      x*std::sin(8.0*y/background.height()) - y*std::cos(11.0*x/background.width()),
      x*std::cos(13.0*y/background.height()) - y*std::sin(8.0*x/background.width()));
  background.normalize(0,180);

  // Init images and create display window.
  CImg<unsigned char> img0(background), img;
  // color is dimensioned with SIZE (it used to be a literal 100) so it stays
  // in step with the other per-triangle arrays if SIZE is ever changed.
  unsigned char white[] = { 255, 255, 255 }, color[SIZE][3];
  CImgDisplay disp(img0,"[#6] - Filled Triangles (Click to shrink)");

  // Define random properties (pos, size, colors, ..) for all triangles that will be displayed.
  float posx[SIZE], posy[SIZE], rayon[SIZE], angle[SIZE], veloc[SIZE], opacity[SIZE];
  int num = 1;  // number of triangles drawn in the current frame
  std::srand((unsigned int)time(0));
  // I'm thinking of offloading these operations to the GPU
  for (int k = 0; k<SIZE; ++k) {
    posx[k] = (float)(cimg::rand()*img0.width());
    posy[k] = (float)(cimg::rand()*img0.height());
    rayon[k] = (float)(10 + cimg::rand()*50);
    angle[k] = (float)(cimg::rand()*360);
    veloc[k] = (float)(cimg::rand()*20 - 10);
    color[k][0] = (unsigned char)(cimg::rand()*255);
    color[k][1] = (unsigned char)(cimg::rand()*255);
    color[k][2] = (unsigned char)(cimg::rand()*255);
    opacity[k] = (float)(0.3 + 1.5*cimg::rand());
  }

  // Start animation loop.
  while (!disp.is_closed() && !disp.is_keyQ() && !disp.is_keyESC()) {
    // Restart every frame from the clean background.
    img = img0;

    /* Maybe offload this for loop to GPU? */
    // Draw each triangle on the background image.
    for (int k = 0; k<num; ++k) {
      // Vertices lie on a circle of radius rayon[k] around (posx[k],posy[k]),
      // 120 degrees apart; angle[k] is kept in degrees.
      const int
        x0 = (int)(posx[k] + rayon[k]*std::cos(angle[k]*cimg::PI/180)),
        y0 = (int)(posy[k] + rayon[k]*std::sin(angle[k]*cimg::PI/180)),
        x1 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 120)*cimg::PI/180)),
        y1 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 120)*cimg::PI/180)),
        x2 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 240)*cimg::PI/180)),
        y2 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 240)*cimg::PI/180));

      // Every tenth triangle (k%10 == 0) is filled from img0; the others use their own color.
      if (k%10) img.draw_triangle(x0,y0,x1,y1,x2,y2,color[k],opacity[k]);
      else img.draw_triangle(x0,y0,x1,y1,x2,y2,img0,0,0,img0.width()-1,0,0,img.height()-1,opacity[k]);
      // White pass using the line-pattern overload (~0U = all pattern bits set).
      img.draw_triangle(x0,y0,x1,y1,x2,y2,white,opacity[k],~0U);

      // Make the triangles rotate, and check for mouse click event.
      // (to make triangles collapse or join).
      angle[k]+=veloc[k];
      if (disp.mouse_x()>0 && disp.mouse_y()>0) {
        // (u,v) points from the triangle to the pointer; a pressed button flips it,
        // turning the default "move away" into "move towards".
        float u = disp.mouse_x() - posx[k], v = disp.mouse_y() - posy[k];
        if (disp.button()) { u = -u; v = -v; }
        posx[k]-=0.03f*u, posy[k]-=0.03f*v;
        // A triangle pushed outside the image gets a new random coordinate.
        if (posx[k]<0 || posx[k]>=img.width()) posx[k] = (float)(cimg::rand()*img.width());
        if (posy[k]<0 || posy[k]>=img.height()) posy[k] = (float)(cimg::rand()*img.height());
      }
    }

    // Display current animation framerate, and refresh display window.
    img.draw_text(5,5,"%u frames/s",white,0,0.5f,13,(unsigned int)disp.frames_per_second());
    img0.resize(disp.display(img).resize(false).wait(20));
    // Add one more triangle per frame until all SIZE are shown.
    if (++num>SIZE) num = SIZE;

    // Allow the user to toggle fullscreen mode, by pressing CTRL+F.
    if (disp.is_keyCTRLLEFT() && disp.is_keyF()) disp.resize(640,480,false).toggle_fullscreen(false);
  }
  return 0;
}
Initial Profile (on OpenSUSE Linux at Seneca)
Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls us/call us/call name 82.26 2.55 2.55 4820368 0.53 0.53 frame_dummy 12.26 2.93 0.38 cimg_library::CImg<unsigned char>& cimg_library::CImg<unsigned char>::draw_line<unsigned char>(int, int, int, int, unsigned char const*, float, unsigned int, bool) 2.58 3.01 0.08 10965 7.30 7.33 cimg_library::CImg<unsigned char>& cimg_library::CImg<unsigned char>::draw_image<float, float>(int, int, int, int, cimg_library::CImg<float> const&, cimg_library::CImg<float> const&, float, float) 1.94 3.07 0.06 cimg_library::CImg<unsigned char>& cimg_library::CImg<unsigned char>::_draw_triangle<unsigned char>(int, int, int, int, int, int, unsigned char const*, float, float) 0.32 3.08 0.01 298115 0.03 0.03 cimg_library::CImg<unsigned char>::is_empty() const 0.32 3.09 0.01 item_3d_reflection() 0.32 3.10 0.01 cimg_library::CImg<float>::fillC(unsigned int, unsigned int, unsigned int, double, ...) 0.00 3.10 0.00 14040 0.00 0.00 cimg_library::CImg<float>::assign(unsigned int, unsigned int, unsigned int, unsigned int) 0.00 3.10 0.00 13270 0.00 0.00 cimg_library::CImg<float>::assign(float const*, unsigned int, unsigned int, unsigned int, unsigned int) 0.00 3.10 0.00 9053 0.00 0.00 cimg_library::cimg::X11_attr() 0.00 3.10 0.00 7130 0.00 0.00 cimg_library::CImg<float>::~CImg() 0.00 3.10 0.00 2305 0.00 0.00 cimg_library::CImg<float>::move_to(cimg_library::CImg<float>&) 0.00 3.10 0.00 1793 0.00 0.00 cimg_library::CImg<float>& cimg_library::CImg<float>::assign<float>(cimg_library::CImg<float> const&, bool) 0.00 3.10 0.00 1024 0.00 0.00 cimg_library::CImg<unsigned long>::CImg(unsigned int, unsigned int, unsigned int, unsigned int) 0.00 3.10 0.00 1000 0.00 0.53 cimg_library::CImgDisplay& cimg_library::CImgDisplay::render<unsigned char>(cimg_library::CImg<unsigned char> const&, bool) 0.00 3.10 0.00 1000 0.00 80.40 cimg_library::CImg<unsigned char>& cimg_library::CImg<unsigned char>::_draw_text<unsigned 
char, unsigned char, float>(int, int, char const*, unsigned char const*, unsigned char const*, float, cimg_library::CImgList<float> const&) 0.00 3.10 0.00 1000 0.00 0.00 cimg_library::CImg<unsigned char>::assign(unsigned int, unsigned int, unsigned int, unsigned int) 0.00 3.10 0.00 1000 0.00 81.11 cimg_library::CImg<unsigned char>& cimg_library::CImg<unsigned char>::draw_text<unsigned char, unsigned char>(int, int, char const*, unsigned char const*, unsigned char const*, float, unsigned int, ...) 0.00 3.10 0.00 769 0.00 0.70 cimg_library::CImg<float>::resize(int, int, int, int, int, unsigned int, float, float, float, float) 0.00 3.10 0.00 769 0.00 0.00 cimg_library::CImg<float>::CImg(cimg_library::CImg<float> const&) 0.00 3.10 0.00 769 0.00 0.70 cimg_library::CImg<float>::get_resize(int, int, int, int, int, unsigned int, float, float, float, float) const 0.00 3.10 0.00 768 0.00 0.00 cimg_library::CImg<unsigned long>::fill(unsigned long) 0.00 3.10 0.00 702 0.00 0.00 cimg_library::CImg<float>::draw_image(int, int, int, int, cimg_library::CImg<float> const&, float) 0.00 3.10 0.00 513 0.00 0.00 cimg_library::CImg<float>::assign(unsigned int, unsigned int, unsigned int, unsigned int, float) 0.00 3.10 0.00 512 0.00 0.00 cimg_library::CImgList<float>::insert(cimg_library::CImg<float> const&, unsigned int, bool) 0.00 3.10 0.00 189 0.00 0.00 cimg_library::CImg<float>::CImg(unsigned int, unsigned int, unsigned int, unsigned int) 0.00 3.10 0.00 189 0.00 0.00 cimg_library::CImg<float>::get_crop(int, int, int, int, int, int, int, int, bool) const 0.00 3.10 0.00 67 0.00 0.00 cimg_library::CImg<float>::CImg(unsigned int, unsigned int, unsigned int, unsigned int, float) 0.00 3.10 0.00 6 0.00 0.00 cimg_library::CImgList<float>::assign(unsigned int) 0.00 3.10 0.00 3 0.00 0.35 cimg_library::CImgDisplay::_handle_events(_XEvent const*) 0.00 3.10 0.00 2 0.00 0.00 cimg_library::CImgList<float>::assign(cimg_library::CImgList<float> const&, bool) 0.00 3.10 0.00 2 0.00 0.00 
cimg_library::CImgList<float>::~CImgList() 0.00 3.10 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z22item_blurring_gradientv 0.00 3.10 0.00 1 0.00 0.00 cimg_library::CImgDisplay::_map_window() 0.00 3.10 0.00 1 0.00 0.00 cimg_library::CImgDisplay::_assign(unsigned int, unsigned int, char const*, unsigned int, bool, bool) 0.00 3.10 0.00 1 0.00 0.00 cimg_library::CImg<unsigned char>::~CImg() 0.00 3.10 0.00 1 0.00 136.13 cimg_library::CImgList<float>::font(unsigned int, bool) 0.00 3.10 0.00 1 0.00 136.13 cimg_library::CImgList<float>::_font(unsigned int const*, unsigned int, unsigned int, bool) 0.00 3.10 0.00 1 0.00 0.00 cimg_library::CImgList<float>& cimg_library::CImgList<float>::insert<float>(cimg_library::CImgList<float> const&, unsigned int, bool) 0.00 3.10 0.00 1 0.00 0.00 cimg_library::CImgList<float>::CImgList(cimg_library::CImgList<float> const&) 0.00 3.10 0.00 1 0.00 136.13 cimg_library::CImgList<float>::get_crop_font() const
Modified Makefile for Profiling
CIMG_FILES = CImg_demo
# Files which requires external libraries to run.
CIMG_EXTRA_FILES = use_jpeg_buffer \
gmic_gimp \
gmic
CIMG_VERSION = 1.5.4
X11PATH = /usr/X11R6
CC = g++
EXEPFX =
CCVER = $(CC)
ifeq ($(CC),g++)
CCVER = `$(CC) -v 2>&1 | tail -n 1`
endif
ifeq ($(CC),clang++)
CCVER = `$(CC) -v 2>&1 | head -n 1`
endif
# Compiler-specific base flags.
# icc keeps the plain include path; every other compiler gets the profiling build.
ifeq ($(CC),icc)
CCVER = "icc \( `$(CC) -v 2>&1`\)"
CFLAGS = -I..
LDFLAGS =
else
# ADDED PROFILING CALLS HERE #
# -g keeps debug symbols and -pg instruments the build for gprof; -pg is passed
# at link time as well. The wiki bold markers (''') that surrounded these two
# assignments were removed: with them make defines variables named '''CFLAGS
# and '''LDFLAGS instead of CFLAGS and LDFLAGS.
CFLAGS = -I.. -Wall -W -O2 -g -pg
LDFLAGS = -lm -pg
endif
# Flags to enable strict code standards
ifeq ($(CC),icc)
CIMG_ANSI_CFLAGS = -ansi
else
CIMG_ANSI_CFLAGS = -ansi -pedantic
endif
# Flags to enable code debugging.
CIMG_DEBUG_CFLAGS = -Dcimg_verbosity=3 -Dcimg_strict_warnings -g
# Flags to enable color output messages.
# (requires a VT100 compatible terminal)
CIMG_VT100_CFLAGS = -Dcimg_use_vt100
# Flags to enable code optimization by the compiler.
ifeq ($(CC),icc)
CIMG_OPT_CFLAGS = -O3 -ipo -no-prec-div
else
CIMG_OPT_CFLAGS = -O3 -fno-tree-pre
endif
# Flags to enable OpenMP support.
ifeq ($(CC),icc)
CIMG_OPENMP_CFLAGS = -Dcimg_use_openmp -openmp -i-static
else
CIMG_OPENMP_CFLAGS = -Dcimg_use_openmp -fopenmp
endif
# Flags to enable OpenCV support.
CIMG_OPENCV_CFLAGS = -Dcimg_use_opencv -I/usr/include/opencv
CIMG_OPENCV_LDFLAGS = -lcv -lhighgui
# Flags used to disable display capablities of CImg
CIMG_NODISPLAY_CFLAGS = -Dcimg_display=0
# Flags to enable the use of the X11 library.
# (X11 is used by CImg to handle display windows)
# !!! For 64bits systems : replace -L$(X11PATH)/lib by -L$(X11PATH)/lib64 !!!
CIMG_X11_CFLAGS = -I$(X11PATH)/include
CIMG_X11_LDFLAGS = -L$(X11PATH)/lib -lpthread -lX11
# Flags to enable fast image display, using the XSHM library (when using X11).
# !!! Seems to randomly crash when used on MacOSX and 64bits systems, so use it only when necessary !!!
CIMG_XSHM_CFLAGS = # -Dcimg_use_xshm
CIMG_XSHM_LDFLAGS = # -lXext
# Flags to enable GDI32 display (Windows native).
CIMG_GDI32_CFLAGS = -mwindows
CIMG_GDI32_LDFLAGS = -lgdi32
# Flags to enable screen mode switching, using the XRandr library (when using X11).
# ( http://www.x.org/wiki/Projects/XRandR )
# !!! Not supported by the X11 server on MacOSX, so do not use it on MacOSX !!!
CIMG_XRANDR_CFLAGS = -Dcimg_use_xrandr
CIMG_XRANDR_LDFLAGS = -lXrandr
# Flags to enable native support for PNG image files, using the PNG library.
# ( http://www.libpng.org/ )
CIMG_PNG_CFLAGS = -Dcimg_use_png
CIMG_PNG_LDFLAGS = -lpng -lz
# Flags to enable native support for JPEG image files, using the JPEG library.
# ( http://www.ijg.org/ )
CIMG_JPEG_CFLAGS = -Dcimg_use_jpeg
CIMG_JPEG_LDFLAGS = -ljpeg
# Flags to enable native support for TIFF image files, using the TIFF library.
# ( http://www.libtiff.org/ )
CIMG_TIFF_CFLAGS = -Dcimg_use_tiff
CIMG_TIFF_LDFLAGS = -ltiff
# Flags to enable native support for MINC2 image files, using the MINC2 library.
# ( http://en.wikibooks.org/wiki/MINC/Reference/MINC2.0_Users_Guide )
CIMG_MINC2_CFLAGS = -Dcimg_use_minc2 -I${HOME}/local/include
CIMG_MINC2_LDFLAGS = -lminc_io -lvolume_io2 -lminc2 -lnetcdf -lhdf5 -lz -L${HOME}/local/lib
# Flags to enable native support for EXR image files, using the OpenEXR library.
# ( http://www.openexr.com/ )
CIMG_EXR_CFLAGS = -Dcimg_use_openexr -I/usr/include/OpenEXR
CIMG_EXR_LDFLAGS = -lIlmImf -lHalf
# Flags to enable native support for various video files, using the FFMPEG library.
# ( http://www.ffmpeg.org/ )
CIMG_FFMPEG_CFLAGS = -Dcimg_use_ffmpeg -D__STDC_CONSTANT_MACROS -I/usr/include/libavcodec -I/usr/include/libavformat -I/usr/include/libswscale -I/usr/include/ffmpeg
CIMG_FFMPEG_LDFLAGS = -lavcodec -lavformat -lswscale
# Flags to enable native support for compressed .cimgz files, using the Zlib library.
# ( http://www.zlib.net/ )
CIMG_ZLIB_CFLAGS = -Dcimg_use_zlib
CIMG_ZLIB_LDFLAGS = -lz
# Flags to enable native support of most classical image file formats, using the Magick++ library.
# ( http://www.imagemagick.org/Magick++/ )
CIMG_MAGICK_CFLAGS = -Dcimg_use_magick `Magick++-config --cppflags` `Magick++-config --cxxflags`
CIMG_MAGICK_LDFLAGS = `Magick++-config --ldflags` `Magick++-config --libs`
# Flags to enable faster Discrete Fourier Transform computation, using the FFTW3 library
# ( http://www.fftw.org/ )
CIMG_FFTW3_CFLAGS = -Dcimg_use_fftw3
ifeq ($(MSYSTEM),MINGW32)
CIMG_FFTW3_LDFLAGS = -lfftw3-3
else
CIMG_FFTW3_LDFLAGS = -lfftw3
endif
# Flags to enable the use of LAPACK routines for matrix computation
# ( http://www.netlib.org/lapack/ )
CIMG_LAPACK_CFLAGS = -Dcimg_use_lapack
CIMG_LAPACK_LDFLAGS = -lblas -lg2c -llapack
# Flags to enable the use of the Board library
# ( http://libboard.sourceforge.net/ )
CIMG_BOARD_CFLAGS = -Dcimg_use_board -I/usr/include/board
CIMG_BOARD_LDFLAGS = -lboard
# Flags to compile on Sun Solaris
CIMG_SOLARIS_LDFLAGS = -R$(X11PATH)/lib -lrt -lnsl -lsocket
# Flags to compile GIMP plug-ins.
ifeq ($(MSYSTEM),MINGW32)
CIMG_GIMP_CFLAGS = -mwindows
endif
# Suffix rule: builds the executable $(EXEPFX)<name> directly from <name>.cpp.
# $* is the target stem and $< the .cpp source. CONF_CFLAGS / CONF_LDFLAGS are
# passed in by the platform targets (linux, macosx, windows) on the recursive
# make command line.
# NOTE(review): the recipe lines below appear without their leading tab
# (presumably lost when the Makefile was pasted into this page); make requires
# a tab in front of every recipe line - confirm against the real file.
.cpp:
@echo
@echo "** Compiling '$* ($(CIMG_VERSION))' with '$(CCVER)'"
@echo
$(CC) -o $(EXEPFX)$* $< $(CFLAGS) $(CONF_CFLAGS) $(LDFLAGS) $(CONF_LDFLAGS)
# When STRIP_EXE is 'true', run strip on the result (the .exe name is used under MinGW).
ifeq ($(STRIP_EXE),true)
ifeq ($(MSYSTEM),MINGW32)
strip $(EXEPFX)$*.exe
else
strip $(EXEPFX)$*
endif
endif
# Removes build products: *.exe, object files, editor backup files (*~, #*)
# and the binaries named in CIMG_FILES / CIMG_EXTRA_FILES.
clean:
rm -rf *.exe *.o *~ \#* $(CIMG_FILES) $(CIMG_EXTRA_FILES)
# When an executable prefix is configured, also remove every file that starts with it.
ifneq ($(EXEPFX),)
rm -f $(EXEPFX)*
endif
# Linux/BSD/Mac OSX targets, with X11 display.
# Re-invokes make for the 'all' goal, passing the ANSI, VT100, X11 and XSHM
# compile flags as CONF_CFLAGS and the X11/XSHM link flags as CONF_LDFLAGS.
# NOTE(review): no 'all' rule is visible in this listing - confirm it is defined
# in the real Makefile.
linux:
@$(MAKE) \
"CONF_CFLAGS = \
$(CIMG_ANSI_CFLAGS) \
$(CIMG_VT100_CFLAGS) \
$(CIMG_X11_CFLAGS) \
$(CIMG_XSHM_CFLAGS)" \
"CONF_LDFLAGS = \
$(CIMG_X11_LDFLAGS) \
$(CIMG_XSHM_LDFLAGS)" \
all
# MacOsX targets, with X11 display.
# Re-invokes make for the 'all' goal with the ANSI, VT100 and X11 compile flags
# and the X11 link flags (no XSHM here, unlike the linux target).
# NOTE(review): PROFILE and PROFILE2 are not assigned anywhere in this listing,
# so they expand to nothing unless supplied from the command line or the
# environment; the -pg profiling flags already come from CFLAGS / LDFLAGS.
macosx:
@$(MAKE) \
"CONF_CFLAGS = \
$(PROFILE) \
$(PROFILE2) \
$(CIMG_ANSI_CFLAGS) \
$(CIMG_VT100_CFLAGS) \
$(CIMG_X11_CFLAGS)" \
"CONF_LDFLAGS = \
$(CIMG_X11_LDFLAGS)" \
all
# Windows targets, with GDI32 display.
# Re-invokes make for the 'all' goal with no extra compile flags (CONF_CFLAGS is
# deliberately empty) and links against gdi32 through CIMG_GDI32_LDFLAGS.
windows:
@$(MAKE) \
"CONF_CFLAGS = " \
"CONF_LDFLAGS = \
$(CIMG_GDI32_LDFLAGS)" \
all
Makefile changes
I have made changes to the original Makefile that was provided with this library. I have customized it to compile for profiling by default. A new build can be generated by first cleaning the old build (e.g. make clean). The source can then be compiled by running make <platform>, e.g. make macosx or make linux. You will need to download a third-party program to run the 'make' command on Windows, but the usage is the same. It can also be built on Windows using Visual Studio or on the command line (cl.exe must be in the PATH environment variable).
Build Instructions
This Makefile and application is cross-platform.
make <platform> eg: macosx, linux etc.
I have successfully built this application on the following platforms: Windows 7, OpenSuse linux and Mac OSX Lion.
Summary of Findings
The execution of the program takes roughly 3.10 - 20 seconds (depending on how long you are measuring the calculations of triangle animations). It should be noted that this application originally relied upon user input for execution and for termination. I have modified this initial behavior by ensuring the while loop (which generates the triangles) executes for a maximum of 1000 iterations. The time measured in this assignment is for 1000 iterations of this loop.
Profiling Results
The results of the initial profile show that most of the execution time is consumed by drawing the triangles to the screen one at a time. It seems that this could be optimized by offloading the drawing to n threads for the n triangles to be drawn. This is subject to change, however, because of the additional complexity that interoperability between the GPU and the CPU may introduce.
There is another for loop which sets the properties of each triangle one by one in linear time, O(n). This process can also be offloaded to the GPU using n threads for n triangles. I would need to determine whether this process also involves interoperability between the CPU and GPU.
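The program contains three loops: a for loop for setup, a while loop that accepts user input, and another for loop, nested inside the while loop, that draws the triangles. Only the last two are nested, so the complexity is not O(n^3): the setup loop runs once in O(n), and the while loop runs I frames (capped at 1000 here), each drawing at most n triangles, for a total of O(n + I·n), which is linear in the number of triangles for a fixed number of frames.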
The recorded times will also increase if the maximum number of loop iterations is increased, e.g. to 10000, 100000 or 1000000. This shows the same relationship, only with longer task times.
Potential Parallelization Candidates
Upon analyzing this function I discovered two possible areas where I could optimize the code using threads sent to the GPU. The first is a for loop which sets the attributes for 100 triangles in serial. This task can be done in parallel using 100 threads on the GPU.
// Excerpt from item_filled_triangles(): gives each of the SIZE triangles its
// random start values. The bound is SIZE, as in the full listing, not a literal
// 100. Each iteration writes only element k of every array.
for (int k = 0; k<SIZE; ++k) {
  posx[k] = (float)(cimg::rand()*img0.width());     // x position within the image width
  posy[k] = (float)(cimg::rand()*img0.height());    // y position within the image height
  rayon[k] = (float)(10 + cimg::rand()*50);         // radius
  angle[k] = (float)(cimg::rand()*360);             // start angle, in degrees
  veloc[k] = (float)(cimg::rand()*20 - 10);         // rotation per frame, either direction
  color[k][0] = (unsigned char)(cimg::rand()*255);
  color[k][1] = (unsigned char)(cimg::rand()*255);
  color[k][2] = (unsigned char)(cimg::rand()*255);
  opacity[k] = (float)(0.3 + 1.5*cimg::rand());
}
</pre>
The second instance where this is possible is a bit tricky. It involves another serial for loop. The purpose of this loop is to draw each of the triangles on the screen and manipulate them later on. I am not 100 percent sure this can be done in parallel in practice but in theory it should be possible because the application is drawing out each triangle one by one.
<pre>
// Draw each triangle on the background image.
// Excerpt from item_filled_triangles(): draws the first `num` triangles into
// `img`, then advances their rotation and position.
for (int k = 0; k<num; ++k) {
// Vertices lie on a circle of radius rayon[k] around (posx[k],posy[k]),
// 120 degrees apart; angle[k] is in degrees and converted to radians here.
const int
x0 = (int)(posx[k] + rayon[k]*std::cos(angle[k]*cimg::PI/180)),
y0 = (int)(posy[k] + rayon[k]*std::sin(angle[k]*cimg::PI/180)),
x1 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 120)*cimg::PI/180)),
y1 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 120)*cimg::PI/180)),
x2 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 240)*cimg::PI/180)),
y2 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 240)*cimg::PI/180));
// Every tenth triangle (k%10 == 0) is filled from img0; the others use their own color.
if (k%10) img.draw_triangle(x0,y0,x1,y1,x2,y2,color[k],opacity[k]);
else img.draw_triangle(x0,y0,x1,y1,x2,y2,img0,0,0,img0.width()-1,0,0,img.height()-1,opacity[k]);
// White pass using the line-pattern overload (~0U = all pattern bits set).
img.draw_triangle(x0,y0,x1,y1,x2,y2,white,opacity[k],~0U);
// Make the triangles rotate, and check for mouse click event.
// (to make triangles collapse or join).
angle[k]+=veloc[k];
if (disp.mouse_x()>0 && disp.mouse_y()>0) {
// (u,v) points from the triangle to the pointer; a pressed button flips it,
// so the triangle moves towards the pointer instead of away from it.
float u = disp.mouse_x() - posx[k], v = disp.mouse_y() - posy[k];
if (disp.button()) { u = -u; v = -v; }
posx[k]-=0.03f*u, posy[k]-=0.03f*v;
// A triangle that leaves the image gets a new random coordinate.
if (posx[k]<0 || posx[k]>=img.width()) posx[k] = (float)(cimg::rand()*img.width());
if (posy[k]<0 || posy[k]>=img.height()) posy[k] = (float)(cimg::rand()*img.height());
}
}
Amdahls Law Calculations
Since 100 triangles are generated, we can theoretically create 100 threads, one for each triangle. The draw_line, draw_triangle, and draw_image functions take up roughly 16 percent of the execution time of the application: (0.38 + 0.08 + 0.06) / 3.10 ≈ 0.168 (0.16 is used below). Plugging that into Amdahl's Law with 100 cores we get:
$S_{100} = \dfrac{1}{(1 - 0.16) + \dfrac{0.16}{100}} = \dfrac{1}{0.8416} \approx 1.19$
That is, a speedup of about 1.19, or roughly 1.2, is theoretically achievable per 1000 iterations of the while loop that draws these triangles.
Will I work on this Project? If I can optimize this function or any other function within the CImg library I will continue with this project. If it is not possible to optimize this project within the given time of the course then it will be difficult to continue on with this project and I will have to work with someone else's project. But my initial plan is to continue with this project unless I am told otherwise.
Issues Encountered
Gprof on MacOSx with Intel processor
The profiling tool gprof does not work on MacBooks with an Intel processor (I have an Intel Core i5). This was verified by numerous internet resources and by annoying personal experience.
Makefile builds EVERYTHING
The Makefile shipped with this open source library is programmed to build all of the modules located in CImg_demo.cpp. I had to modify the Makefile to build only the specified function within the library of image processing functions.
Assignment 2
This assignment involved porting to the GPU the functions I had speculated could be processed concurrently. After further analysis I determined that the second for loop cannot be processed in parallel without converting some underlying API functions to device functions. These functions are used in multiple areas of the framework, so I did not pursue changing the framework too much. The first for loop, however, works on independent data that can be processed in parallel, so I went ahead and created kernels for that code. I also noticed that the loop calls cimg::rand(), a function that returns random numbers. Luckily CUDA also has multiple libraries that provide this functionality; off the top of my head, Thrust and CURAND do. I went with CURAND because it has the word "rand" in it and that's what I need. hehe
Kernels
/*
 * Setup and initialize curand with a seed.
 *
 * Each thread initializes its own generator state, state[idx], where idx is
 * the thread's global index. All threads share the fixed seed 100 and use
 * their index as the sequence number, so every run of the program produces
 * the same random values.
 *
 * There is no bounds check: launch exactly as many threads as `state` has
 * elements.
 */
__global__ void initCurand(curandState* state){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(100, idx, 0, &state[idx]);
    // No __syncthreads() here: threads never read each other's state, and a
    // barrier as the last statement of a kernel has nothing left to order.
}
/*
 * CUDA kernel that fills the per-triangle property arrays with random values,
 * one triangle per thread (the host launches 100 threads for arrays of
 * size 100).
 *
 * posx, posy, rayon, veloc, opacity, angle : device arrays, one float per triangle
 * color  : pitched device allocation holding one 3-byte RGB row per triangle
 * height, width : image dimensions used to scale posy / posx
 * state  : per-thread cuRAND states prepared by initCurand
 * pitch  : row stride of `color` in bytes, as returned by cudaMallocPitch
 *
 * Every value is drawn with curand_uniform so the ranges match the CPU loop
 * this kernel replaces, which scaled cimg::rand() (a uniform value between 0
 * and 1 per the CImg documentation). The earlier use of curand_normal (mean 0,
 * standard deviation 1) produced negative radii and off-screen positions, and
 * cast negative floats to unsigned char for the colors.
 *
 * There is no bounds check: launch exactly as many threads as the arrays have
 * elements.
 */
__global__ void initializeArrays(float* posx, float* posy,float* rayon, float* veloc,
                                 float* opacity ,float* angle, unsigned char* color, int height,
                                 int width, curandState* state, size_t pitch){
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Work on a local copy of the generator state.
    curandState localState = state[idx];

    posx[idx] = (float)(curand_uniform(&localState)*width);
    posy[idx] = (float)(curand_uniform(&localState)*height);
    rayon[idx] = (float)(10 + curand_uniform(&localState)*50);
    angle[idx] = (float)(curand_uniform(&localState)*360);
    veloc[idx] = (float)(curand_uniform(&localState)*20 - 10);

    // `color` is a byte pointer, so idx*pitch is the byte offset of this triangle's row.
    unsigned char* rgb = color + idx*pitch;
    rgb[0] = (unsigned char)(curand_uniform(&localState)*255);
    rgb[1] = (unsigned char)(curand_uniform(&localState)*255);
    rgb[2] = (unsigned char)(curand_uniform(&localState)*255);

    opacity[idx] = (float)(0.3f + 1.5f *curand_uniform(&localState));

    // Store the advanced state so a later kernel would continue the sequence
    // instead of repeating these values.
    state[idx] = localState;
    // No __syncthreads(): each thread touches only its own elements.
}
// Reports a failed CUDA API call: when `err` is anything other than
// cudaSuccess, prints `msg` followed by the CUDA error description to
// standard output. Execution continues either way.
void errCheck(cudaError_t err, const char* msg){
    if (err == cudaSuccess) return;
    std::cout << msg << ": " << cudaGetErrorString(err) << std::endl;
}
Body Code
// Reports a failed CUDA API call: when `err` is anything other than
// cudaSuccess, prints `msg` followed by the CUDA error description to
// standard output. Execution continues either way.
void errCheck(cudaError_t err, const char* msg){
    if (err == cudaSuccess) return;
    std::cout << msg << ": " << cudaGetErrorString(err) << std::endl;
}
/*---------------------------
Main procedure
--------------------------*/
int main() {
// Create a colored 640x480 background image which consists of different color shades.
CImg<float> background(640,480,1,3);
cimg_forXY(background,x,y) background.fillC(x,y,0,
x*std::cos(6.0*y/background.height()) + y*std::sin(9.0*x/background.width()),
x*std::sin(8.0*y/background.height()) - y*std::cos(11.0*x/background.width()),
x*std::cos(13.0*y/background.height()) - y*std::sin(8.0*x/background.width()));
background.normalize(0,180);
// Init images and create display window.
CImg<unsigned char> img0(background), img;
unsigned char white[] = { 255, 255, 255 }, color[100][3];
CImgDisplay disp(img0,"[#6] - Filled Triangles (Click to shrink)");
// error handling
cudaError_t err;
// Define random properties (pos, size, colors, ..) for all triangles that will be displayed.
float posx[100];
float posy[100];
float rayon[100];
float angle[100];
float veloc[100];
float opacity[100];
// Define the same properties but for the device
float* d_posx;
float* d_posy;
float* d_rayon;
float* d_angle;
float* d_veloc;
float* d_opacity;
unsigned char* d_color;
// CURAND state
curandState* devState;
// allocate memory on the device for the device arrays
err = cudaMalloc((void**)&d_posx, 100 * sizeof(float));
errCheck(err, "cudaMalloc((void**)&d_posx, 100 * sizeof(float))");
err = cudaMalloc((void**)&d_posy, 100 * sizeof(float));
errCheck(err,"cudaMalloc((void**)&d_posy, 100 * sizeof(float))");
err = cudaMalloc((void**)&d_rayon, 100 * sizeof(float));
errCheck(err,"cudaMalloc((void**)&d_rayon, 100 * sizeof(float))");
err = cudaMalloc((void**)&d_angle, 100 * sizeof(float));
errCheck(err,"cudaMalloc((void**)&d_angle, 100 * sizeof(float))");
err = cudaMalloc((void**)&d_veloc, 100 * sizeof(float));
errCheck(err,"cudaMalloc((void**)&d_veloc, 100 * sizeof(float))");
err = cudaMalloc((void**)&d_opacity, 100 * sizeof(float));
errCheck(err,"cudaMalloc((void**)&d_opacity, 100 * sizeof(float))");
err = cudaMalloc((void**)&devState, 100*sizeof(curandState));
errCheck(err,"cudaMalloc((void**)&devState, 100*sizeof(curandState))");
size_t pitch;
//allocated the device memory for source array
err = cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100);
errCheck(err,"cudaMallocPitch(&d_color, &pitch, 3 * sizeof(unsigned char),100)");
// launch grid of threads
dim3 dimBlock(100);
dim3 dimGrid(1);
/* Kernel for initializing CURAND */
initCurand<<<1,100>>>(devState);
// synchronize the device and the host
cudaDeviceSynchronize();
/*Kernel for initializing Arrays */
initializeArrays<<<1, 100>>>(d_posx, d_posy, d_rayon, d_veloc, d_opacity, d_angle,
d_color, img0.height(), img0.width(), devState, pitch);
// synchronize the device and the host
cudaDeviceSynchronize();
// get the populated arrays back to the host for use
err = cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy(posx,d_posx, 100 * sizeof(float), cudaMemcpyDeviceToHost)");
err = cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy(posy,d_posy, 100 * sizeof(float), cudaMemcpyDeviceToHost)");
err = cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy(rayon,d_rayon, 100 * sizeof(float), cudaMemcpyDeviceToHost)");
err = cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy(veloc,d_veloc, 100 * sizeof(float), cudaMemcpyDeviceToHost)");
err = cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy(opacity,d_opacity, 100 * sizeof(float), cudaMemcpyDeviceToHost)");
err = cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy(angle,d_angle, 100 * sizeof(float), cudaMemcpyDeviceToHost)");
// pitch of color array is 3+1 padded
err = cudaMemcpy2D(color,4,d_color,pitch,3 *sizeof(unsigned char),3, cudaMemcpyDeviceToHost);
errCheck(err,"cudaMemcpy2D(color,pitch,d_color,100*3,3 *sizeof(unsigned char),100* sizeof(unsigned char), cudaMemcpyDeviceToHost)");
// measuring time it takes for triangle animations in 1000 iterations
int i = 0, num = 1;
// Start animation loop.
while (!disp.is_closed() && !disp.is_keyQ() && !disp.is_keyESC() && i < 1000) {
img = img0;
i++;
// Draw each triangle on the background image.
for (int k = 0; k<num; ++k) {
const int
x0 = (int)(posx[k] + rayon[k]*std::cos(angle[k]*cimg::PI/180)),
y0 = (int)(posy[k] + rayon[k]*std::sin(angle[k]*cimg::PI/180)),
x1 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 120)*cimg::PI/180)),
y1 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 120)*cimg::PI/180)),
x2 = (int)(posx[k] + rayon[k]*std::cos((angle[k] + 240)*cimg::PI/180)),
y2 = (int)(posy[k] + rayon[k]*std::sin((angle[k] + 240)*cimg::PI/180));
if (k%10) img.draw_triangle(x0,y0,x1,y1,x2,y2,color[k],opacity[k]);
else img.draw_triangle(x0,y0,x1,y1,x2,y2,img0,0,0,img0.width()-1,0,0,img.height()-1,opacity[k]);
img.draw_triangle(x0,y0,x1,y1,x2,y2,white,opacity[k],~0U);
// Make the triangles rotate, and check for mouse click event.
// (to make triangles collapse or join).
angle[k]+=veloc[k];
if (disp.mouse_x()>0 && disp.mouse_y()>0) {
float u = disp.mouse_x() - posx[k], v = disp.mouse_y() - posy[k];
if (disp.button()) { u = -u; v = -v; }
posx[k]-=0.03f*u, posy[k]-=0.03f*v;
if (posx[k]<0 || posx[k]>=img.width()) posx[k] = (float)(cimg::rand()*img.width());
if (posy[k]<0 || posy[k]>=img.height()) posy[k] = (float)(cimg::rand()*img.height());
}
}
// Display current animation framerate, and refresh display window.
img.draw_text(5,5,"%u frames/s",white,0,0.5f,13,(unsigned int)disp.frames_per_second());
img0.resize(disp.display(img).resize(false).wait(20));
if (++num>100) num = 100;
// Allow the user to toggle fullscreen mode, by pressing CTRL+F.
if (disp.is_keyCTRLLEFT() && disp.is_keyF()) disp.resize(640,480,false).toggle_fullscreen(false);
}
// free allocated device memory
cudaFree(d_posy);
cudaFree(d_posx);
cudaFree(d_rayon);
cudaFree(d_veloc);
cudaFree(d_opacity);
cudaFree(d_color);
cudaFree(d_angle);
cudaFree(devState);
return 0;
}
Profiling Results
The entirety of the GPU code executes in 0.15 - 0.16 seconds. I profiled the application using nvvp and NSight on Visual Studio 2010.
Issues Encountered
Makefile does not work on the Windows platform. Tried adding the -lcurand linker flag but to no avail.
The code does compile cleanly without error on Visual Studio 2010 IDE.
Coding issues:
Stackoverflow Questions I made:
With the help of the open-source community I was able to solve my problems and gain a better understanding of CUDA and CURAND.