Difference between revisions of "The parallelizing Express"
Mradmanovic (talk | contribs) (→Assignment 2) |
(→Results) |
||
(19 intermediate revisions by 2 users not shown) | |||
Line 600: | Line 600: | ||
== Assignment 2 == | == Assignment 2 == | ||
+ | |||
+ | Source : [https://www.dropbox.com/s/5tf3jial3rg9d4k/a2.7z?dl=0 A2] | ||
=== Description === | === Description === | ||
'''Removing CPU Bottleneck''' | '''Removing CPU Bottleneck''' | ||
− | + | The CPU bottleneck in the ColorTransfer/main.cpp: | |
<pre> | <pre> | ||
Line 622: | Line 624: | ||
''' Added functions and changes''' | ''' Added functions and changes''' | ||
− | We wrote a matrix by vector multiplication | + | To fix this issue We wrote a device function which handles a matrix by vector multiplication. We also wrote a kernel which has equal logic to the host version so we could off load all the needed data to the kernel without needing to do multiple back and forth(s). We also wrote a helper function which will allocate all transfer all the opencv matrices to a suitable form for the kernel to deal with. |
Matrix by vector | Matrix by vector | ||
<pre> | <pre> | ||
− | void | + | __device__ void matvec(float* d_A, float* d_B, float* d_C) |
− | + | { | |
− | + | int tid = threadIdx.x + blockIdx.x * blockDim.x; | |
− | + | ||
− | float | + | float sum = 0; |
− | + | if (tid < 3) | |
− | + | { | |
− | + | for (int i = 0; i < 3; ++i) | |
− | + | { | |
− | + | sum += d_A[i] * d_B[(i * 3) + tid]; | |
− | + | } | |
− | + | ||
− | + | d_C[tid] = sum; | |
− | |||
− | |||
− | |||
} | } | ||
− | + | } | |
− | + | </pre> | |
− | + | ||
− | + | Kernel | |
− | + | <pre> | |
+ | __global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C, | ||
+ | const int n, int targetrows, int targetcols, float* d_Tar) | ||
+ | { | ||
+ | const double eps = 1.0e-4; | ||
+ | for (int y = 0; y < targetrows; ++y) { | ||
+ | for (int x = 0; x < targetcols; ++x) { | ||
+ | memcpy(&d_A, &d_Tar[y * 3 + x], 3 * sizeof(float)); | ||
+ | |||
+ | matvec(d_A, d_RGB2, d_C); | ||
+ | memcpy(&d_A, d_C, 3 * sizeof(float)); | ||
+ | |||
+ | for (int c = 0; c < 3; c++) | ||
+ | d_A[c] = d_A[c] > -5.0 ? pow((double)10.0, (double)d_A[c]) : eps; | ||
+ | |||
+ | matvec(d_A, d_LMS2, d_C); | ||
+ | memcpy(&d_Tar[y * 3 + x], d_C, 3 * sizeof(float)); | ||
+ | } | ||
} | } | ||
− | + | } | |
− | + | </pre> | |
− | + | ||
− | + | Helper | |
− | + | <pre> | |
− | // | + | inline void vecTransfer(float* h, Color3d* v) |
− | + | { | |
− | + | for (int j = 0; j < 3; ++j) | |
− | + | h[j] = v->v[j]; | |
+ | } | ||
+ | |||
+ | //KERNEL Helper function does setup and launch | ||
+ | void matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar) | ||
+ | { | ||
+ | float *h_A, *h_RGB2, *h_LMS2, *d_Tar; | ||
+ | float *d_A, *d_RGB2, *d_LMS2, *d_C; | ||
+ | |||
+ | int N = 3; | ||
+ | |||
+ | h_A = (float*)malloc(sizeof(float) * N); | ||
+ | h_RGB2 = new float[mRGB2LMS->total()]; | ||
+ | h_LMS2 = new float[mLMS2lab->total()]; | ||
+ | //h_C = (float*)malloc(sizeof(float) * N); | ||
+ | |||
+ | cudaMalloc((void**)&d_A, sizeof(float) * N); | ||
+ | cudaMalloc((void**)&d_RGB2, sizeof(float) * N * N); | ||
+ | cudaMalloc((void**)&d_LMS2, sizeof(float) * N * N); | ||
+ | cudaMalloc((void**)&d_C, sizeof(h_C)); | ||
+ | cudaMalloc((void**)&d_Tar, sizeof(h_Tar)); | ||
+ | |||
+ | Color3d vec; | ||
+ | |||
+ | //copy vec and matrix to host pointers | ||
+ | vecTransfer(h_A, &vec); | ||
+ | memcpy(h_RGB2, mRGB2LMS->data, mRGB2LMS->total()); | ||
+ | memcpy(h_LMS2, mLMS2lab->data, mLMS2lab->total()); | ||
+ | |||
+ | cudaMemcpy(d_A, h_A, sizeof(float) * N, cudaMemcpyHostToDevice); | ||
+ | cudaMemcpy(d_RGB2, h_RGB2, sizeof(float) * N * N, cudaMemcpyHostToDevice); | ||
+ | cudaMemcpy(d_LMS2, h_LMS2, sizeof(float) * N * N, cudaMemcpyHostToDevice); | ||
+ | cudaMemcpy(d_Tar, h_Tar, sizeof(h_Tar), cudaMemcpyHostToDevice); | ||
+ | |||
+ | matvec_kernel << <N / BLOCK_SIZE + 1, BLOCK_SIZE >> >(d_A, d_RGB2, d_LMS2, d_C, N, tarrow, tarcol, d_Tar); | ||
+ | //printf("error code: %s\n",cudaGetErrorString(cudaGetLastError())); | ||
+ | |||
+ | cudaMemcpy(h_C, d_C, sizeof(h_C), cudaMemcpyDeviceToHost); | ||
+ | |||
+ | free(h_A); | ||
+ | free(h_RGB2); | ||
+ | free(h_LMS2); | ||
+ | //free(h_C); | ||
− | + | cudaFree(d_A); | |
− | + | cudaFree(d_RGB2); | |
− | + | cudaFree(d_LMS2); | |
− | + | cudaFree(d_C); | |
− | + | cudaFree(d_Tar); | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | cudaFree( | ||
− | cudaFree( | ||
− | cudaFree( | ||
} | } | ||
</pre> | </pre> | ||
+ | |||
Changes to main loop | Changes to main loop | ||
Line 686: | Line 730: | ||
Old | Old | ||
<pre> | <pre> | ||
− | + | for (int y = 0; y < target.rows; y++) { | |
− | for(int y=0; y<target.rows; y++) { | + | for (int x = 0; x < target.cols; x++) { |
− | + | v = target.at<Color3d>(y, x); | |
− | + | ||
− | + | matvec_L(&v, &mlab2LMS, h_C); | |
− | + | memcpy(&v, h_C, N * sizeof(float)); | |
+ | |||
+ | for (int c = 0; c < 3; c++) | ||
+ | v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps; | ||
− | + | matvec_L(&v, &mLMS2RGB, h_C); | |
− | + | memcpy(&target.at<Color3d>(y, x), h_C, N * sizeof(float)); | |
+ | } | ||
} | } | ||
</pre> | </pre> | ||
Line 700: | Line 748: | ||
New | New | ||
<pre> | <pre> | ||
− | + | float* h_TARGET = (float *)malloc(sizeof(target.data)); | |
− | float* | + | memcpy(h_TARGET, target.data, sizeof(target.data)); |
+ | matvec_L(&mlab2LMS, &mLMS2RGB, h_C, rows, cols, h_TARGET); | ||
+ | </pre> | ||
+ | |||
+ | === Results === | ||
+ | |||
+ | For our profiling we had three test cases. | ||
+ | <pre> | ||
+ | (Size is in pixels) | ||
+ | Small : 400x400 by 400x400 | ||
+ | Medium : 1400x989 by 1215x717 | ||
+ | Large : 7362x4858 by 6000x4032 | ||
+ | </pre> | ||
− | + | [[File:a2timings.PNG]] | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | == Assignment 3 == | |
− | |||
− | + | === Link to Original Unchanged Project and project used for a3 === | |
− | + | ||
− | + | [https://www.dropbox.com/s/32q70e9iovpm80l/a2.7z?dl=0 Download] | |
+ | |||
+ | The file contains the visual studio solutions used for Assignment 3 as well as visual studio solution version of the original code. | ||
+ | |||
+ | To run the location of the OPENCV directory must be applied to the following project properties: | ||
+ | |||
+ | 1. Under c/c++ -> general -> additional include directories -> the path to opencv\..\..\include | ||
+ | 2. Under linker -> general -> additional library directories -> the path to opencv\..\..\lib | ||
+ | |||
+ | After doing the above, build the solution and run the appropriate Release or Debug exe with the target and reference image as arguments. | ||
+ | |||
+ | A link the the corresponding xls file of the run time can be found [https://www.dropbox.com/s/ecu7eycyv41krwq/a3.xlsx?dl=0 here] | ||
+ | |||
+ | === What was done === | ||
+ | |||
+ | At first the power function used was switched out with __pow in the kernel as the traditional pow function is more heavy of a function. But the results were very small causing a different from around 10-30 milliseconds. Afterwards the kernel was upgraded to implement grid and strides. In doing so instead of doing all the calculations on one thread, many of them were able to be calculated on a separate row. This made it so that for every pixel with a row one thread would be responsible for the colour shift. | ||
+ | |||
+ | Other implementations were made to transfer all the data necessary for calculations all at once from the beginning and then perform all calculations done by tatsy on the device side, but due to time constraints and the complication of the project we were unable to fully implement these changes. The code is however left (commented) in the included project download. | ||
+ | |||
+ | === Optimized Kernel === | ||
+ | <pre> | ||
+ | __global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C, | ||
+ | const int n, int targetrows, int targetcols, float* d_Tar) | ||
+ | { | ||
+ | const double eps = 1.0e-4; | ||
+ | //grid-stride loop | ||
+ | for (int tid = threadIdx.x + blockIdx.x * blockDim.x; | ||
+ | tid < targetrows; | ||
+ | tid += blockDim.x * gridDim.x) | ||
+ | { | ||
+ | for (int x = 0; x < targetcols; ++x) { | ||
+ | memcpy(&d_A, &d_Tar[tid * 3 + x], 3 * sizeof(float)); | ||
+ | |||
+ | matvec(d_A, d_RGB2, d_C); | ||
+ | memcpy(&d_A, d_C, 3 * sizeof(float)); | ||
+ | |||
+ | for (int c = 0; c < 3; c++) | ||
+ | d_A[c] = d_A[c] > -5.0 ? __powf(10.0f, d_A[c]) : eps; | ||
+ | |||
+ | matvec(d_A, d_LMS2, d_C); | ||
+ | memcpy(&d_Tar[tid * 3 + x], d_C, 3 * sizeof(float)); | ||
+ | } | ||
+ | } | ||
} | } | ||
</pre> | </pre> | ||
+ | |||
+ | === Results === | ||
+ | |||
+ | [[File:a3timings.PNG]] | ||
+ | |||
+ | |||
+ | When running in release and comparing the results to the original unchanged project straight from tatsy we noticed that for very small images the cuda version is slightly slower. This is probably due to the actual conversion of colour on a smaller image to be much shorter, whereas the transferring of the data over to device memory itself may add additional time. Overall when comparing the optimized and unoptimized versions of the kernel there was a visible increase. |
Latest revision as of 19:12, 13 April 2017
Contents
The parallelizing Express
o x o x o x o . . . o _____ _______________ ___=====__T___ .][__n_n_|DD[ ====_____ | |.\/.| | | |_| |_ >(________|__|_[_________]_|____|_/\_|___|_|___________|_| _/oo OOOOO oo` ooo ooo o^o o^o o^o o^o
-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
Repository
Team Members
Email All
Progress
Assignment 1
SudokuMP by Marko R
SudokuMP solves sudoku puzzles given a txt file arranging the puzzle and the size of the puzzle. There are two methods to solve said puzzles the brute force method and the humanistic solver. Profiling wasn't performed on the humanistic solver since it manages to solve all the 16x16 puzzles extremely quickly the worst of them (expert) still being at around 12ms. The expert puzzle is too difficult for the brute force method and takes over an hour to run so it wasn't profiled. The project can be found here: SudokuMP
Running the Program
The git repository comes with a pre-compiled version of the program so you can run it immediately (you might need to give it the correct permissions to run). The git repository also comes with a make file so if you want to compile your own version delete the pre-compiled version and run make. The program takes two command line arguments the first being the path to the txt file containing the sudoku puzzle and the second argument being the size of the puzzle cells.
16x16 easy 1
Command line: ./sudoku src/16x16/easy1.txt 16
Before : After : 00 00 00 00 | 00 00 00 08 | 00 00 07 00 | 00 05 04 00 11 01 09 10 | 06 13 03 08 | 02 14 07 12 | 16 05 04 15 00 00 04 00 | 00 02 16 00 | 01 05 00 11 | 09 12 00 00 06 15 04 13 | 10 02 16 14 | 01 05 03 11 | 09 12 08 07 00 08 00 07 | 05 00 15 11 | 09 04 16 00 | 02 00 00 00 14 08 03 07 | 05 12 15 11 | 09 04 16 06 | 02 10 13 01 05 00 12 16 | 01 09 00 00 | 00 13 00 00 | 00 00 00 06 05 02 12 16 | 01 09 07 04 | 08 13 10 15 | 03 11 14 06 ----------------------------------------------------- ----------------------------------------------------- 15 00 10 00 | 00 00 02 00 | 00 00 05 00 | 00 00 16 03 15 11 10 09 | 08 07 02 12 | 13 01 05 04 | 14 06 16 03 00 07 00 00 | 00 00 00 10 | 06 15 08 00 | 00 09 02 00 12 07 14 03 | 13 05 04 10 | 06 15 08 16 | 01 09 02 11 01 00 13 00 | 14 00 00 00 | 10 07 00 09 | 00 08 00 05 01 16 13 04 | 14 06 11 03 | 10 07 02 09 | 12 08 15 05 08 06 00 00 | 00 00 01 00 | 14 00 00 03 | 13 04 00 10 08 06 02 05 | 09 15 01 16 | 14 12 11 03 | 13 04 07 10 ----------------------------------------------------- ----------------------------------------------------- 13 00 16 06 | 15 00 00 09 | 00 11 00 00 | 00 00 01 02 13 14 16 06 | 15 10 05 09 | 04 11 12 08 | 07 03 01 02 03 00 08 00 | 12 00 06 01 | 00 00 00 07 | 00 16 00 13 03 10 08 11 | 12 04 06 01 | 15 02 14 07 | 05 16 09 13 00 05 01 00 | 00 14 08 07 | 16 00 00 00 | 00 00 12 00 09 05 01 02 | 03 14 08 07 | 16 06 13 10 | 11 15 12 04 07 04 00 00 | 00 11 00 00 | 00 09 00 00 | 00 14 00 08 07 04 15 12 | 16 11 13 02 | 03 09 01 05 | 10 14 06 08 ----------------------------------------------------- ----------------------------------------------------- 16 00 00 00 | 00 00 12 00 | 00 00 06 01 | 15 02 00 09 16 13 07 14 | 04 03 12 05 | 11 08 06 01 | 15 02 10 09 00 00 00 01 | 00 08 10 06 | 12 16 00 13 | 04 00 03 00 02 09 05 01 | 11 08 10 06 | 12 16 15 13 | 04 07 03 14 00 00 11 08 | 02 00 09 15 | 00 03 04 00 | 00 13 00 00 10 12 11 08 | 02 01 09 15 | 07 03 04 14 | 06 13 05 16 00 03 06 00 | 00 16 00 00 | 05 00 00 00 | 00 00 00 00 04 03 06 15 | 07 16 14 13 | 05 10 09 02 | 08 01 11 12
NOTE : The profiles for this project were quite large so only the most used functions are listed here. The entire profile file can be find through the links provided above the respective flat profile.
Link to full profile : FULL
Flat Profile :
Each sample counts as 0.01 seconds. no time accumulated % cumulative self self total time seconds seconds calls s/call s/call name 65.87 0.81 0.81 59789 0.00 0.00 create_copy_board(Board*) 21.14 1.07 0.26 59777 0.00 0.00 choose_cell_bf(Board*, int&, int&) 6.51 1.15 0.08 59790 0.00 0.00 Board::Board(int) 6.51 1.23 0.08 59777 0.00 0.00 Board::~Board() 0.00 1.23 0.00 179709 0.00 0.00 clear_number(Board*, int, Align, int) 0.00 1.23 0.00 179370 0.00 0.00 Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&) 0.00 1.23 0.00 59903 0.00 0.00 update_solution(Board*, int, int, int) 0.00 1.23 0.00 59790 0.00 0.00 _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_ 0.00 1.23 0.00 59790 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 1.23 0.00 59790 0.00 0.00 decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 1.23 0.00 59790 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&) 0.00 1.23 0.00 59790 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&) 0.00 1.23 0.00 59790 0.00 0.00 operator new(unsigned long, void*) 0.00 1.23 0.00 59784 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 1.23 0.00 59780 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::end() 0.00 1.23 0.00 59778 0.00 0.00 void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**) 0.00 1.23 0.00 59778 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const 0.00 1.23 0.00 59778 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::empty() const 0.00 1.23 0.00 59778 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const 0.00 1.23 0.00 59778 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator--() 0.00 1.23 0.00 59778 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 1.23 0.00 59778 0.00 0.00 void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 1.23 0.00 59778 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::back() 0.00 1.23 0.00 59778 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::pop_back() 0.00 1.23 0.00 59778 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop() 0.00 1.23 0.00 59778 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top() 0.00 1.23 0.00 59778 0.00 0.00 bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 1.23 0.00 59777 0.00 0.00 update_stack(Board*) 0.00 1.23 0.00 24 0.00 0.00 std::__deque_buf_size(unsigned long)
16x16 medium 1
Command line: ./sudoku src/16x16/medium1.txt 16
Before : After : 00 00 06 15 | 00 12 09 04 | 11 02 07 16 | 00 03 00 13 05 01 06 15 | 08 12 09 04 | 11 02 07 16 | 14 03 10 13 00 10 00 00 | 01 00 00 15 | 12 00 08 00 | 00 02 00 00 14 10 13 03 | 01 05 06 15 | 12 09 08 04 | 16 02 11 07 07 11 16 02 | 00 00 00 03 | 05 15 01 06 | 09 00 00 00 07 11 16 02 | 13 10 14 03 | 05 15 01 06 | 09 04 12 08 00 12 00 00 | 07 11 00 00 | 00 03 13 14 | 06 15 05 00 08 12 09 04 | 07 11 16 02 | 10 03 13 14 | 06 15 05 01 ----------------------------------------------------- ----------------------------------------------------- 00 06 12 00 | 00 00 11 00 | 00 00 03 10 | 00 13 14 15 04 06 12 01 | 02 09 11 08 | 16 07 03 10 | 05 13 14 15 00 00 10 00 | 00 14 00 00 | 00 01 00 00 | 00 00 09 00 03 16 10 07 | 15 14 05 13 | 06 01 04 12 | 11 08 09 02 02 09 00 00 | 03 00 00 00 | 14 00 15 05 | 12 00 06 00 02 09 11 08 | 03 16 10 07 | 14 13 15 05 | 12 01 06 04 15 00 00 13 | 00 06 12 00 | 09 00 00 00 | 10 07 00 03 15 05 14 13 | 04 06 12 01 | 09 08 11 02 | 10 07 16 03 ----------------------------------------------------- ----------------------------------------------------- 10 02 00 00 | 05 03 00 14 | 15 06 00 01 | 08 09 00 11 10 02 07 16 | 05 03 13 14 | 15 06 12 01 | 08 09 04 11 00 00 00 14 | 00 00 01 06 | 04 00 00 00 | 07 00 00 00 12 13 03 14 | 09 15 01 06 | 04 11 10 08 | 07 16 02 05 11 00 08 09 | 00 00 07 00 | 00 14 00 00 | 00 06 15 12 11 04 08 09 | 10 02 07 16 | 03 14 05 13 | 01 06 15 12 00 15 01 00 | 11 00 08 00 | 00 00 00 07 | 00 00 03 00 06 15 01 05 | 11 04 08 12 | 02 16 09 07 | 13 14 03 10 ----------------------------------------------------- ----------------------------------------------------- 16 00 02 00 | 14 00 00 10 | 13 05 00 15 | 04 00 01 00 16 07 02 11 | 14 08 03 10 | 13 05 06 15 | 04 12 01 09 00 00 00 00 | 00 00 00 00 | 08 00 00 00 | 00 00 07 14 01 03 15 06 | 12 13 04 09 | 08 10 16 11 | 02 05 07 14 00 00 00 00 | 16 00 00 00 | 00 00 14 03 | 00 00 00 00 09 08 05 12 | 16 01 02 11 | 07 04 14 03 | 15 10 13 06 00 00 00 00 | 06 00 15 05 | 01 12 00 00 | 00 11 08 00 13 14 04 10 | 06 07 15 05 | 01 12 02 09 | 03 11 08 16
Link to full profile : FULL
Flat Profile :
Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls ms/call ms/call name 70.23 0.33 0.33 23099 0.01 0.02 create_copy_board(Board*) 14.90 0.40 0.07 23083 0.00 0.00 choose_cell_bf(Board*, int&, int&) 10.64 0.45 0.05 23100 0.00 0.00 Board::Board(int) 4.26 0.47 0.02 23083 0.00 0.00 Board::~Board() 0.00 0.47 0.00 69645 0.00 0.00 clear_number(Board*, int, Align, int) 0.00 0.47 0.00 69300 0.00 0.00 Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&) 0.00 0.47 0.00 23215 0.00 0.00 update_solution(Board*, int, int, int) 0.00 0.47 0.00 23100 0.00 0.00 _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_ 0.00 0.47 0.00 23100 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 0.47 0.00 23100 0.00 0.00 decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 0.47 0.00 23100 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&) 0.00 0.47 0.00 23100 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&) 0.00 0.47 0.00 23100 0.00 0.00 operator new(unsigned long, void*) 0.00 0.47 0.00 23090 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 0.47 0.00 23086 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::end() 0.00 0.47 0.00 23084 0.00 0.00 void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**) 0.00 0.47 0.00 23084 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const 0.00 0.47 0.00 23084 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::empty() const 0.00 0.47 0.00 23084 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const 0.00 0.47 0.00 23084 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator--() 0.00 0.47 0.00 23084 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 0.47 0.00 23084 0.00 0.00 void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 0.47 0.00 23084 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::back() 0.00 0.47 0.00 23084 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::pop_back() 0.00 0.47 0.00 23084 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop() 0.00 0.47 0.00 23084 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top() 0.00 0.47 0.00 23084 0.00 0.00 bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 0.47 0.00 23083 0.00 0.02 update_stack(Board*) 0.00 0.47 0.00 24 0.00 0.00 std::__deque_buf_size(unsigned long)
16x16 hard 1
Command line: ./sudoku src/16x16/hard1.txt 16
Before : After : 02 00 00 00 | 00 00 00 16 | 00 06 00 00 | 00 00 00 05 02 07 14 03 | 09 15 13 16 | 04 06 10 08 | 01 11 12 05 00 10 08 00 | 00 00 00 00 | 00 00 00 00 | 00 13 15 00 04 10 08 06 | 05 01 11 12 | 02 07 14 03 | 16 13 15 09 00 01 12 05 | 00 00 00 00 | 00 00 00 00 | 10 04 08 00 11 01 12 05 | 03 07 02 14 | 13 09 16 15 | 10 04 08 06 13 00 15 09 | 06 00 00 00 | 00 00 01 12 | 07 02 00 00 13 16 15 09 | 06 08 04 10 | 11 05 01 12 | 07 02 14 03 ----------------------------------------------------- ----------------------------------------------------- 00 00 00 08 | 12 11 00 00 | 00 00 03 02 | 09 00 00 00 01 06 04 08 | 12 11 07 05 | 16 14 03 02 | 09 10 13 15 16 00 00 00 | 15 13 10 00 | 00 08 06 04 | 00 00 00 00 16 03 02 14 | 15 13 10 09 | 01 08 06 04 | 05 07 11 12 00 00 13 00 | 00 04 01 06 | 07 12 05 00 | 00 00 00 00 10 09 13 15 | 08 04 01 06 | 07 12 05 11 | 03 16 02 14 00 00 00 00 | 14 02 16 03 | 10 15 09 00 | 00 00 00 00 07 05 11 12 | 14 02 16 03 | 10 15 09 13 | 06 01 04 08 ----------------------------------------------------- ----------------------------------------------------- 15 00 00 16 | 10 00 00 00 | 00 00 04 05 | 00 00 00 07 15 02 09 16 | 10 06 08 13 | 12 01 04 05 | 11 14 03 07 00 11 00 07 | 00 00 15 00 | 00 10 00 06 | 04 00 05 01 14 11 03 07 | 16 09 15 02 | 08 10 13 06 | 04 12 05 01 12 00 05 01 | 00 03 14 00 | 00 16 02 00 | 13 08 00 10 12 04 05 01 | 07 03 14 11 | 15 16 02 09 | 13 08 06 10 00 13 06 00 | 00 00 00 04 | 14 00 00 00 | 02 15 09 00 08 13 06 10 | 01 05 12 04 | 14 03 11 07 | 02 15 09 16 ----------------------------------------------------- ----------------------------------------------------- 03 00 07 11 | 00 16 00 00 | 00 00 15 00 | 08 05 00 04 03 12 07 11 | 02 16 09 01 | 06 13 15 14 | 08 05 10 04 00 00 00 13 | 00 00 05 08 | 03 11 00 00 | 14 00 00 00 06 15 01 13 | 04 12 05 08 | 03 11 07 10 | 14 09 16 02 00 00 00 04 | 11 00 00 00 | 00 00 00 16 | 15 00 00 00 05 08 10 04 | 11 14 03 07 | 09 02 12 16 | 15 06 01 13 00 00 00 00 | 13 10 06 15 | 05 04 08 01 | 00 00 00 00 09 14 16 02 | 13 10 06 15 | 05 04 08 01 | 12 03 07 11
Link to full profile : FULL
Flat Profile :
Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 66.17 5.94 5.94 432968 0.00 0.00 create_copy_board(Board*) 22.28 7.94 2.00 432950 0.00 0.00 choose_cell_bf(Board*, int&, int&) 6.02 8.48 0.54 432969 0.00 0.00 Board::Board(int) 4.46 8.88 0.40 432950 0.00 0.00 Board::~Board() 0.67 8.94 0.06 1299222 0.00 0.00 clear_number(Board*, int, Align, int) 0.22 8.96 0.02 433074 0.00 0.00 update_solution(Board*, int, int, int) 0.22 8.98 0.02 432950 0.00 0.00 update_stack(Board*) 0.00 8.98 0.00 1298907 0.00 0.00 Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&) 0.00 8.98 0.00 432969 0.00 0.00 _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_ 0.00 8.98 0.00 432969 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 8.98 0.00 432969 0.00 0.00 decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 8.98 0.00 432969 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&) 0.00 8.98 0.00 432969 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&) 0.00 8.98 0.00 432969 0.00 0.00 operator new(unsigned long, void*) 0.00 8.98 0.00 432957 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 8.98 0.00 432953 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::end() 0.00 8.98 0.00 432951 0.00 0.00 void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**) 0.00 8.98 0.00 432951 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const 0.00 8.98 0.00 432951 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::empty() const 0.00 8.98 0.00 432951 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const 0.00 8.98 0.00 432951 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator--() 0.00 8.98 0.00 432951 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 8.98 0.00 432951 0.00 0.00 void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 8.98 0.00 432951 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::back() 0.00 8.98 0.00 432951 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::pop_back() 0.00 8.98 0.00 432951 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop() 0.00 8.98 0.00 432951 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top() 0.00 8.98 0.00 432951 0.00 0.00 bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 8.98 0.00 24 0.00 0.00 std::__deque_buf_size(unsigned long) 0.00 8.98 0.00 6 0.00 0.00 std::_Deque_iterator<State*, State*&, State**>::_Deque_iterator(std::_Deque_iterator<State*, State*&, State**> const&)
16x16 hard 2
Command line: ./sudoku src/16x16/hard2.txt 16
Before : After : 00 00 00 13 | 15 00 00 07 | 08 11 00 16 | 00 00 00 00 10 03 14 13 | 15 12 05 07 | 08 11 09 16 | 01 04 02 06 00 00 00 00 | 00 01 14 00 | 00 00 00 02 | 00 00 05 00 07 15 11 16 | 03 01 14 13 | 10 06 04 02 | 12 08 05 09 00 00 08 00 | 10 00 00 00 | 00 00 15 12 | 00 00 03 14 09 06 08 01 | 10 11 02 04 | 05 07 15 12 | 13 16 03 14 00 00 00 00 | 06 00 00 09 | 01 00 13 00 | 00 10 07 00 04 05 02 12 | 06 16 08 09 | 01 14 13 03 | 15 10 07 11 ----------------------------------------------------- ----------------------------------------------------- 00 07 05 00 | 12 10 00 08 | 00 00 06 00 | 00 13 00 00 03 07 05 02 | 12 10 11 08 | 14 04 06 15 | 16 13 09 01 00 00 00 11 | 00 00 00 00 | 07 00 00 00 | 00 15 00 03 12 08 01 11 | 14 13 16 06 | 07 02 05 09 | 04 15 10 03 00 00 06 04 | 00 15 09 00 | 12 03 11 00 | 00 00 00 07 16 13 06 04 | 02 15 09 01 | 12 03 11 10 | 05 14 08 07 14 00 00 15 | 00 00 00 00 | 00 00 00 13 | 06 00 00 00 14 09 10 15 | 05 04 07 03 | 16 01 08 13 | 06 12 11 02 ----------------------------------------------------- ----------------------------------------------------- 00 04 00 09 | 07 00 00 00 | 15 00 00 00 | 00 11 00 10 02 04 13 09 | 07 14 12 05 | 15 16 03 06 | 08 11 01 10 01 14 00 00 | 00 00 03 00 | 00 09 07 00 | 00 00 13 04 01 14 12 10 | 16 06 03 15 | 11 09 07 08 | 02 05 13 04 05 00 00 00 | 00 08 00 11 | 04 10 00 00 | 14 00 00 15 05 16 03 06 | 09 08 13 11 | 04 10 02 01 | 14 07 12 15 00 00 15 07 | 00 02 00 00 | 00 00 14 00 | 00 00 06 00 08 11 15 07 | 04 02 01 10 | 13 12 14 05 | 09 03 06 16 ----------------------------------------------------- ----------------------------------------------------- 00 10 09 00 | 00 00 00 14 | 00 13 00 00 | 00 00 00 08 11 10 09 05 | 01 07 15 14 | 02 13 12 04 | 03 06 16 08 00 02 00 08 | 11 00 04 00 | 03 00 16 00 | 00 00 00 00 06 02 07 08 | 11 09 04 12 | 03 05 16 14 | 10 01 15 13 13 00 00 00 | 00 05 00 16 | 00 15 00 07 | 11 00 14 12 13 01 04 03 | 08 05 06 16 | 09 15 10 07 | 11 02 14 12 00 12 00 00 | 13 03 10 00 | 06 08 00 11 | 00 00 00 00 15 12 16 14 | 13 03 10 02 | 06 08 01 11 | 07 09 04 05
Link to full profile : FULL
Flat Profile :
Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 63.01 4.68 4.68 347198 0.00 0.00 create_copy_board(Board*) 24.77 6.52 1.84 347184 0.00 0.00 choose_cell_bf(Board*, int&, int&) 7.54 7.08 0.56 347199 0.00 0.00 Board::Board(int) 4.04 7.38 0.30 347184 0.00 0.00 Board::~Board() 0.54 7.42 0.04 1041882 0.00 0.00 clear_number(Board*, int, Align, int) 0.13 7.43 0.01 347294 0.00 0.00 update_solution(Board*, int, int, int) 0.00 7.43 0.00 1041597 0.00 0.00 Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&) 0.00 7.43 0.00 347199 0.00 0.00 _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_ 0.00 7.43 0.00 347199 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 7.43 0.00 347199 0.00 0.00 decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&) 0.00 7.43 0.00 347199 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&) 0.00 7.43 0.00 347199 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&) 0.00 7.43 0.00 347199 0.00 0.00 operator new(unsigned long, void*) 0.00 7.43 0.00 347191 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 7.43 0.00 347187 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::end() 0.00 7.43 0.00 347185 0.00 0.00 void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**) 0.00 7.43 0.00 347185 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const 0.00 7.43 0.00 347185 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::empty() const 0.00 7.43 0.00 347185 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const 0.00 7.43 0.00 347185 0.00 0.00 std::_Deque_iterator<Board*, Board*&, Board**>::operator--() 0.00 7.43 0.00 347185 0.00 0.00 std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 7.43 0.00 347185 0.00 0.00 void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**) 0.00 7.43 0.00 347185 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::back() 0.00 7.43 0.00 347185 0.00 0.00 std::deque<Board*, std::allocator<Board*> >::pop_back() 0.00 7.43 0.00 347185 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop() 0.00 7.43 0.00 347185 0.00 0.00 std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top() 0.00 7.43 0.00 347185 0.00 0.00 bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&) 0.00 7.43 0.00 347184 0.00 0.00 update_stack(Board*) 0.00 7.43 0.00 24 0.00 0.00 std::__deque_buf_size(unsigned long)
Analysis / Code Snippet
The two functions which dominate the runtime of this program are create_copy_board which accounts for roughly 60% or more of the exceution time and choose_cell_bf which accounts for another 20% generally.
Board* create_copy_board(Board* b){ Board* copy_board = new Board(b->dim); for(int row = 0; row < b->dim; row++) { for(int col = 0; col < b->dim; col++) { copy_board->solution[row][col] = b->solution[row][col]; for(int index = 0; index < b->dim; index++) { copy_board->cells[row][col][index] = b->cells[row][col][index]; } } } copy_board->cells_solved = b->cells_solved; return copy_board; } /* Choose cell with the least number of possible values * Return 0 if that cell has 0 values */ bool choose_cell_bf(Board* b, int &row, int &col) { int least = b->dim+1, counter; for(int r = 0; r < b->dim; r++) { for(int c = 0; c < b->dim; c++) { counter = 0; if(!b->solution[r][c]) { for(int num = 0; num < b->dim; num++) { if(b->cells[r][c][num]) { counter++; } } if(counter < least) { least = counter; row = r; col = c; } } } } return (least != 0); }
ColorTransfer by Jadrian S
The Source code can be found here This program is an implementation of image color transfer detailed by Erik Reinhard and coded by Tatsy it use OpenCV to do so. This program takes 2 images the first image denoted by the args is the image being editted with colours. The second image is an image that the program will take the colours from. After the color transfer an image is generated with the color transfer completed and is opened for the user to view.
Creating and Running the Project with gprof profiling
Perform the following steps in a Linux/Unix System:
STEP 1 - Download the following zips and extract their contents as they will be used for later steps
-> test files for opencv:Download OR File:Test.zip -> ColorTransfer:Download OR File:ColorTransfer.zip
STEP 2 - Install all the required entries(make,cmake,git,gprof,etc) with the following terminal command:
-> sudo apt-get install build-essential cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev binutils
STEP 3 - Download and Install OpenCV from their official page located here. You can do this with the following steps and commands:
-> Download the latest source data of OpenCV for Linux/Mac -> Extract the zipped source data into its appropriately named folder opencv-(latest version) -> Open terminal and enter us the cd command to enter the extracted folder opencv-(latest version) -> Create a build directy with the command mkdir build -> Use the cd command to enter the build folder -> Use the following cmake command to generate the appropriate files for the make command: cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local .. -> Use make with the -j7 command to allow for up to 7 jobs when preparing the install: make -j7 -> Install with: sudo make install -> OpenCV should be installed
STEP 4 - Use the Test Folder provided by the dropbox link above to see if OpenCV is working correctly. You can follow these steps to do so
-> In terminal cd into the test folder -> cmake . -> make -> run display image with the follow command ./DisplayImage ahri.jpg -> If the image provided is displayed OpenCV should be in working order.
STEP 5 - Build the ColorTransfer program with the folder provided by the dropbox link and create a flat profile using gprof
-> Create the files for make using the following command: cmake -DCMAKE_CXX_FLAGS=-pg -DCMAKE_EXE_LINKER_FLAGS=-pg -DCMAKE_SHARED_LINKER_FLAGS=-pg . -> The additional options include -pg within the cmake so that the appropriate gmon.out can be created for gprof -> After the files are generated use the make command -> Run the created program with 2 arguments arg1 being the image beind editted and arg2 being the image to take colours from (two images are provided for your convenience). -> If using the provided image the command should like: ./ColorTransfer ahri.jpg ahri2.jpg -> This should generate the colour transferred output image as well as the appropriate gmon.out file -> Use the either of the following commands for the flat profile: gprof -p -b ./ColorTransfer > ColorTransfer.flt OR gprof test_gprof gmon.out > analysis.txt
Example Output
If using the examples provided you should have the following image results:
Input 1
Input 2
Output
Profile
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total time seconds seconds calls ms/call ms/call name 36.65 0.37 0.37 187749717 0.00 0.00 Color3d::operator()(int) 22.78 0.60 0.23 6771110 0.00 0.00 operator*(cv::Mat const&, Color3d&) 13.37 0.74 0.14 60939990 0.00 0.00 double const& cv::Mat::at<double>(int, int) const 11.89 0.86 0.12 main 4.95 0.91 0.05 14413379 0.00 0.00 Color3d::operator=(Color3d const&) 3.96 0.95 0.04 13156175 0.00 0.00 Color3d& cv::Mat::at<Color3d>(int, int) 1.98 0.97 0.02 7543200 0.00 0.00 double& cv::Mat::at<double>(int, int) 0.99 0.98 0.01 13156184 0.00 0.00 Color3d::Color3d() 0.99 0.99 0.01 2128357 0.00 0.00 Color3d::operator*(Color3d const&) 0.99 1.00 0.01 frame_dummy 0.50 1.00 0.01 4256710 0.00 0.00 Color3d::operator+(Color3d const&) 0.50 1.01 0.01 1 5.00 5.00 cvflann::anyimpl::big_any_policy<cvflann::anyimpl::empty_any>::big_any_policy() 0.50 1.01 0.01 Color3d::Color3d(Color3d const&) 0.00 1.01 0.00 44 0.00 0.00 cv::Mat::release() 0.00 1.01 0.00 17 0.00 0.00 cv::_InputArray::init(int, void const*) 0.00 1.01 0.00 14 0.00 0.00 cv::Mat::~Mat() 0.00 1.01 0.00 11 0.00 0.00 cv::_InputArray::~_InputArray() 0.00 1.01 0.00 11 0.00 0.00 cv::Size_<int>::Size_() 0.00 1.01 0.00 9 0.00 0.00 cvflann::anyimpl::base_any_policy::base_any_policy() 0.00 1.01 0.00 9 0.00 0.00 cvflann::anyimpl::base_any_policy::~base_any_policy() 0.00 1.01 0.00 6 0.00 0.00 cv::_InputArray::_InputArray() 0.00 1.01 0.00 6 0.00 0.00 cv::_OutputArray::_OutputArray(cv::Mat&) 0.00 1.01 0.00 6 0.00 0.00 cv::_OutputArray::~_OutputArray() 0.00 1.01 0.00 6 0.00 0.00 cv::MatSize::MatSize(int*) 0.00 1.01 0.00 6 0.00 0.00 cv::MatStep::MatStep() 0.00 1.01 0.00 5 0.00 0.00 cv::_InputArray::_InputArray(cv::Mat const&) 0.00 1.01 0.00 5 0.00 0.00 cv::String::String(char const*) 0.00 1.01 0.00 5 0.00 0.00 cv::String::~String() 0.00 1.01 0.00 4 0.00 0.00 cv::Mat::create(int, int, int) 0.00 1.01 0.00 4 0.00 0.00 cv::Mat::Mat(int, int, int) 0.00 1.01 0.00 4 0.00 0.00 Color3d::divide(double) 0.00 1.01 0.00 4 0.00 0.00 Color3d::Color3d(double, double, double) 0.00 1.01 0.00 2 0.00 0.00 cv::Mat::Mat() 0.00 1.01 0.00 2 0.00 0.00 cv::MatExpr::~MatExpr() 0.00 1.01 0.00 2 0.00 0.00 Color3d::operator-(Color3d const&) 0.00 1.01 0.00 2 0.00 0.00 cv::Mat::empty() const 0.00 1.01 0.00 2 0.00 0.00 cv::Mat::total() const 0.00 1.01 0.00 2 0.00 0.00 cv::MatExpr::operator cv::Mat() const 0.00 1.01 0.00 1 0.00 5.00 _GLOBAL__sub_I__ZmlRKN2cv3MatER7Color3d 0.00 1.01 0.00 1 0.00 5.00 __static_initialization_and_destruction_0(int, int) 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::big_any_policy<cv::String>::big_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::big_any_policy<cvflann::flann_algorithm_t>::big_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::big_any_policy<cvflann::flann_centers_init_t>::big_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::small_any_policy<char const*>::small_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::small_any_policy<bool>::small_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::small_any_policy<float>::small_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::small_any_policy<int>::small_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::small_any_policy<unsigned int>::small_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cv::String>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cv::String>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cvflann::anyimpl::empty_any>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cvflann::anyimpl::empty_any>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cvflann::flann_algorithm_t>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cvflann::flann_algorithm_t>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cvflann::flann_centers_init_t>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<cvflann::flann_centers_init_t>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<char const*>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<char const*>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<bool>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<bool>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<float>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<float>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<int>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<int>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<unsigned int>::typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 cvflann::anyimpl::typed_base_any_policy<unsigned int>::~typed_base_any_policy() 0.00 1.01 0.00 1 0.00 0.00 __gnu_cxx::new_allocator<int>::new_allocator() 0.00 1.01 0.00 1 0.00 0.00 __gnu_cxx::new_allocator<int>::~new_allocator() 0.00 1.01 0.00 1 0.00 0.00 std::allocator<int>::allocator() 0.00 1.01 0.00 1 0.00 0.00 std::allocator<int>::~allocator() 0.00 1.01 0.00 1 0.00 0.00 void std::_Destroy_aux<true>::__destroy<int*>(int*, int*) 0.00 1.01 0.00 1 0.00 0.00 std::_Vector_base<int, std::allocator<int> >::_Vector_impl::_Vector_impl() 0.00 1.01 0.00 1 0.00 0.00 std::_Vector_base<int, std::allocator<int> >::_Vector_impl::~_Vector_impl() 0.00 1.01 0.00 1 0.00 0.00 std::_Vector_base<int, std::allocator<int> >::_M_deallocate(int*, unsigned long) 0.00 1.01 0.00 1 0.00 0.00 std::_Vector_base<int, std::allocator<int> >::_M_get_Tp_allocator() 0.00 1.01 0.00 1 0.00 0.00 std::_Vector_base<int, std::allocator<int> >::_Vector_base() 0.00 1.01 0.00 1 0.00 0.00 std::_Vector_base<int, std::allocator<int> >::~_Vector_base() 0.00 1.01 0.00 1 0.00 0.00 std::vector<int, std::allocator<int> >::vector() 0.00 1.01 0.00 1 0.00 0.00 std::vector<int, std::allocator<int> >::~vector() 0.00 1.01 0.00 1 0.00 0.00 void std::_Destroy<int*>(int*, int*) 0.00 1.01 0.00 1 0.00 0.00 void std::_Destroy<int*, int>(int*, int*, std::allocator<int>&)
Call Graph
Full Profile Data File
Since the profile data is too large a download of it in pdf format will also be included.
Download: Media:profile-data.pdf
Analysis
As seen in the call graph of the profile data 98% of the time spent in the program is spent processing in the main file. Within this main multiple calls to the Color3d overloaded operator is made (located in the Color3d file). This makes the program ideal for parallelization as many of the steps used to process the pixel colours can be delegated to the Nvidia GPU. This can be seen in the above code snippets (found in the link above) when calls to Color3d operator are done multiple times in a triple for loop.
Sorted array processing by Abbas Zoeb
Sorted array processing finds the processing time it takes for an array before and after it has been sorted. This code basically allocates an array with random numbers (array size is 32768). Then the array is sorted using the standard library inbuilt sorting algorithm (which most likely is using Quick Sort). Then another loop is run through the entire array to find all the elements that are above the number '127'. This loop is inside another loop that rotates for another 100,000 times. The loop would go by faster as the array is already sorted and would be much easier now to find elements greater than 127. Profiling was performed on this c++ code on matrix. The project can be found here: [1]
Running the Program
The file can be simply compiled using g++ or clang++ on matrix.
Steps to compile the code and do the profiling on it:
0) Copy the code from GitHub and paste it into a cpp file (I named it sorted.cpp)
1) g++ -Wall -pg sorted.cpp -o sorted
2) ./sorted
3) gprof sorted gmon.out > prof_output
4) cat prof_output
Now you can see the flat and graph profiling for the code.
The Code
#include <algorithm> #include <ctime> #include <iostream> int main() { // Generate data const unsigned arraySize = 32768; int data[arraySize]; for (unsigned c = 0; c < arraySize; ++c) data[c] = std::rand() % 256; // The next loop runs faster because we are using a sorted array std::sort(data, data + arraySize); // Test clock_t start = clock(); long long sum = 0; for (unsigned i = 0; i < 100000; ++i) { // Primary loop for (unsigned c = 0; c < arraySize; ++c) { if (data[c] >= 128) sum += data[c]; } } double elapsedTime = static_cast<double>(clock() - start) / CLOCKS_PER_SEC; std::cout << elapsedTime << std::endl; std::cout << "sum = " << sum << std::endl; }
Analysis
Here we can see the profiling as below:
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total time seconds seconds calls ms/call ms/call name 99.93 14.89 14.89 main 0.07 14.90 0.01 120348 0.00 0.00 void std::__iter_swap<true>::iter_swap<int*, int*>(int*, int*) 0.00 14.90 0.00 120348 0.00 0.00 void std::swap<int>(int&, int&) 0.00 14.90 0.00 120348 0.00 0.00 void std::iter_swap<int*, int*>(int*, int*) 0.00 14.90 0.00 32767 0.00 0.00 void std::__unguarded_linear_insert<int*>(int*) 0.00 14.90 0.00 2963 0.00 0.00 void std::__move_median_first<int*>(int*, int*, int*) 0.00 14.90 0.00 2963 0.00 0.00 int* std::__unguarded_partition<int*, int>(int*, int*, int const&) 0.00 14.90 0.00 2963 0.00 0.00 int* std::__unguarded_partition_pivot<int*>(int*, int*) 0.00 14.90 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 14.90 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 14.90 0.00 1 0.00 0.00 void std::__insertion_sort<int*>(int*, int*) 0.00 14.90 0.00 1 0.00 10.00 void std::__introsort_loop<int*, int>(int*, int*, int) 0.00 14.90 0.00 1 0.00 0.00 void std::__final_insertion_sort<int*>(int*, int*) 0.00 14.90 0.00 1 0.00 0.00 void std::__unguarded_insertion_sort<int*>(int*, int*) 0.00 14.90 0.00 1 0.00 0.00 std::__lg(int) 0.00 14.90 0.00 1 0.00 10.00 void std::sort<int*>(int*, int*)
Full profiling can be found here - http://pastebin.com/vByZLWtd
Index by function name
[14] _GLOBAL__sub_I_main [7] int* std::__unguarded_partition<int*, int>(int*, int*, int const&) [6] void std::sort<int*>(int*, int*) [15] __static_initialization_and_destruction_0(int, int) [17] void std::__final_insertion_sort<int*>(int*, int*) [12] void std::swap<int>(int&, int&) [2] void std::__iter_swap<true>::iter_swap<int*, int*>(int*, int*) [13] void std::__unguarded_linear_insert<int*>(int*) [3] void std::iter_swap<int*, int*>(int*, int*) [16] void std::__insertion_sort<int*>(int*, int*) [18] void std::__unguarded_insertion_sort<int*>(int*, int*) [1] main [5] void std::__introsort_loop<int*, int>(int*, int*, int) [4] int* std::__unguarded_partition_pivot<int*>(int*, int*) [8] void std::__move_median_first<int*>(int*, int*, int*) [19] std::__lg(int)
- The compilation of the code takes roughly 14 seconds and when the code is profiled we can see that the main uses most of the % processing time. i.e., 99.93%.
Final Statement
We plan to parallelize the "ColorTransfer" program as it yields better theoretical speedup of the execution of the whole task in comparison to Sudoku and Sorted Array Processing.
Assignment 2
Source : A2
Description
Removing CPU Bottleneck
The CPU bottleneck in the ColorTransfer/main.cpp:
// Multiplication of matrix and vector Color3d operator *(const cv::Mat& M, Color3d& v) { Color3d u = Color3d(); for(int i=0; i<3; i++) { u(i) = 0.0; for(int j=0; j<3; j++) { u(i) += M.at<double>(i, j) * v(j); } } return u; }
Added functions and changes
To fix this issue We wrote a device function which handles a matrix by vector multiplication. We also wrote a kernel which has equal logic to the host version so we could off load all the needed data to the kernel without needing to do multiple back and forth(s). We also wrote a helper function which will allocate all transfer all the opencv matrices to a suitable form for the kernel to deal with.
Matrix by vector
__device__ void matvec(float* d_A, float* d_B, float* d_C) { int tid = threadIdx.x + blockIdx.x * blockDim.x; float sum = 0; if (tid < 3) { for (int i = 0; i < 3; ++i) { sum += d_A[i] * d_B[(i * 3) + tid]; } d_C[tid] = sum; } }
Kernel
__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C, const int n, int targetrows, int targetcols, float* d_Tar) { const double eps = 1.0e-4; for (int y = 0; y < targetrows; ++y) { for (int x = 0; x < targetcols; ++x) { memcpy(&d_A, &d_Tar[y * 3 + x], 3 * sizeof(float)); matvec(d_A, d_RGB2, d_C); memcpy(&d_A, d_C, 3 * sizeof(float)); for (int c = 0; c < 3; c++) d_A[c] = d_A[c] > -5.0 ? pow((double)10.0, (double)d_A[c]) : eps; matvec(d_A, d_LMS2, d_C); memcpy(&d_Tar[y * 3 + x], d_C, 3 * sizeof(float)); } } }
Helper
inline void vecTransfer(float* h, Color3d* v) { for (int j = 0; j < 3; ++j) h[j] = v->v[j]; } //KERNEL Helper function does setup and launch void matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar) { float *h_A, *h_RGB2, *h_LMS2, *d_Tar; float *d_A, *d_RGB2, *d_LMS2, *d_C; int N = 3; h_A = (float*)malloc(sizeof(float) * N); h_RGB2 = new float[mRGB2LMS->total()]; h_LMS2 = new float[mLMS2lab->total()]; //h_C = (float*)malloc(sizeof(float) * N); cudaMalloc((void**)&d_A, sizeof(float) * N); cudaMalloc((void**)&d_RGB2, sizeof(float) * N * N); cudaMalloc((void**)&d_LMS2, sizeof(float) * N * N); cudaMalloc((void**)&d_C, sizeof(h_C)); cudaMalloc((void**)&d_Tar, sizeof(h_Tar)); Color3d vec; //copy vec and matrix to host pointers vecTransfer(h_A, &vec); memcpy(h_RGB2, mRGB2LMS->data, mRGB2LMS->total()); memcpy(h_LMS2, mLMS2lab->data, mLMS2lab->total()); cudaMemcpy(d_A, h_A, sizeof(float) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_RGB2, h_RGB2, sizeof(float) * N * N, cudaMemcpyHostToDevice); cudaMemcpy(d_LMS2, h_LMS2, sizeof(float) * N * N, cudaMemcpyHostToDevice); cudaMemcpy(d_Tar, h_Tar, sizeof(h_Tar), cudaMemcpyHostToDevice); matvec_kernel << <N / BLOCK_SIZE + 1, BLOCK_SIZE >> >(d_A, d_RGB2, d_LMS2, d_C, N, tarrow, tarcol, d_Tar); //printf("error code: %s\n",cudaGetErrorString(cudaGetLastError())); cudaMemcpy(h_C, d_C, sizeof(h_C), cudaMemcpyDeviceToHost); free(h_A); free(h_RGB2); free(h_LMS2); //free(h_C); cudaFree(d_A); cudaFree(d_RGB2); cudaFree(d_LMS2); cudaFree(d_C); cudaFree(d_Tar); }
Changes to main loop
Old
for (int y = 0; y < target.rows; y++) { for (int x = 0; x < target.cols; x++) { v = target.at<Color3d>(y, x); matvec_L(&v, &mlab2LMS, h_C); memcpy(&v, h_C, N * sizeof(float)); for (int c = 0; c < 3; c++) v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps; matvec_L(&v, &mLMS2RGB, h_C); memcpy(&target.at<Color3d>(y, x), h_C, N * sizeof(float)); } }
New
float* h_TARGET = (float *)malloc(sizeof(target.data)); memcpy(h_TARGET, target.data, sizeof(target.data)); matvec_L(&mlab2LMS, &mLMS2RGB, h_C, rows, cols, h_TARGET);
Results
For our profiling we had three test cases.
(Size is in pixels) Small : 400x400 by 400x400 Medium : 1400x989 by 1215x717 Large : 7362x4858 by 6000x4032
Assignment 3
Link to Original Unchanged Project and project used for a3
The file contains the visual studio solutions used for Assignment 3 as well as visual studio solution version of the original code.
To run the location of the OPENCV directory must be applied to the following project properties:
1. Under c/c++ -> general -> additional include directories -> the path to opencv\..\..\include 2. Under linker -> general -> additional library directories -> the path to opencv\..\..\lib
After doing the above, build the solution and run the appropriate Release or Debug exe with the target and reference image as arguments.
A link the the corresponding xls file of the run time can be found here
What was done
At first the power function used was switched out with __pow in the kernel as the traditional pow function is more heavy of a function. But the results were very small causing a different from around 10-30 milliseconds. Afterwards the kernel was upgraded to implement grid and strides. In doing so instead of doing all the calculations on one thread, many of them were able to be calculated on a separate row. This made it so that for every pixel with a row one thread would be responsible for the colour shift.
Other implementations were made to transfer all the data necessary for calculations all at once from the beginning and then perform all calculations done by tatsy on the device side, but due to time constraints and the complication of the project we were unable to fully implement these changes. The code is however left (commented) in the included project download.
Optimized Kernel
__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C, const int n, int targetrows, int targetcols, float* d_Tar) { const double eps = 1.0e-4; //grid-stride loop for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < targetrows; tid += blockDim.x * gridDim.x) { for (int x = 0; x < targetcols; ++x) { memcpy(&d_A, &d_Tar[tid * 3 + x], 3 * sizeof(float)); matvec(d_A, d_RGB2, d_C); memcpy(&d_A, d_C, 3 * sizeof(float)); for (int c = 0; c < 3; c++) d_A[c] = d_A[c] > -5.0 ? __powf(10.0f, d_A[c]) : eps; matvec(d_A, d_LMS2, d_C); memcpy(&d_Tar[tid * 3 + x], d_C, 3 * sizeof(float)); } } }
Results
When running in release and comparing the results to the original unchanged project straight from tatsy we noticed that for very small images the cuda version is slightly slower. This is probably due to the actual conversion of colour on a smaller image to be much shorter, whereas the transferring of the data over to device memory itself may add additional time. Overall when comparing the optimized and unoptimized versions of the kernel there was a visible increase.