Difference between revisions of "The parallelizing Express"

From CDOT Wiki
Jump to: navigation, search
(Assignment 2)
(Results)
 
(19 intermediate revisions by 2 users not shown)
Line 600: Line 600:
  
 
== Assignment 2 ==
 
== Assignment 2 ==
 +
 +
Source : [https://www.dropbox.com/s/5tf3jial3rg9d4k/a2.7z?dl=0 A2]
  
 
=== Description ===
 
=== Description ===
 
'''Removing CPU Bottleneck'''
 
'''Removing CPU Bottleneck'''
  
Removing the old CPU bottleneck in the ColorTransfer/main.cpp:
+
The CPU bottleneck in the ColorTransfer/main.cpp:
  
 
<pre>
 
<pre>
Line 622: Line 624:
 
''' Added functions and changes'''
 
''' Added functions and changes'''
  
We wrote a matrix by vector multiplication and made a few minor adjustments to the main loop that deals with the color shift for the target (image to be modified).
+
To fix this issue We wrote a device function which handles a matrix by vector multiplication. We also wrote a kernel which has equal logic to the host version so we could off load all the needed data to the kernel without needing to do multiple back and forth(s). We also wrote a helper function which will allocate all transfer all the opencv matrices to a suitable form for the kernel to deal with.
  
 
Matrix by vector
 
Matrix by vector
 
<pre>
 
<pre>
void sgemv(const float* h_A, const float* h_B, float* h_C, int n) {
+
__device__ void matvec(float* d_A, float* d_B, float* d_C)
     // level 3 calculation: C = alpha * A * B + beta * C
+
{
    float* devPtrA;
+
     int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float* devPtrB;
+
 
     float* devPtrC;
+
     float sum = 0;
     // ... allocate memory on the device
+
     if (tid < 3)
    cudaMalloc((void**)&devPtrA, n * n * sizeof(float));
+
     {
     cudaMalloc((void**)&devPtrB, n * sizeof(float));
+
        for (int i = 0; i < 3; ++i)
    cudaMalloc((void**)&devPtrC, n * sizeof(float));
+
        {
    // ... create cuBLAS context
+
            sum += d_A[i] * d_B[(i * 3) + tid];
    cublasHandle_t handle;
+
        }
    cublasStatus_t status;
+
 
    status = cublasCreate(&handle);
+
         d_C[tid] = sum;
    if (status != CUBLAS_STATUS_SUCCESS) {
 
        std::cerr << "***cublasCreate failed***\n";
 
         return;
 
 
     }
 
     }
    // ... copy host matrices to the device
+
}
    status = cublasSetMatrix(n, n, sizeof(float), h_A, n, devPtrA, n);
+
</pre>
     if (status != CUBLAS_STATUS_SUCCESS) {
+
 
         std::cerr << "***cublasSetMatrix A failed***\n";
+
Kernel
         return;
+
<pre>
 +
__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C,
 +
    const int n, int targetrows, int targetcols, float* d_Tar)
 +
{
 +
    const double eps = 1.0e-4;
 +
     for (int y = 0; y < targetrows; ++y) {
 +
         for (int x = 0; x < targetcols; ++x) {
 +
            memcpy(&d_A, &d_Tar[y * 3 + x], 3 * sizeof(float));
 +
 
 +
            matvec(d_A, d_RGB2, d_C);
 +
            memcpy(&d_A, d_C, 3 * sizeof(float));
 +
 
 +
            for (int c = 0; c < 3; c++)
 +
                d_A[c] = d_A[c] > -5.0 ? pow((double)10.0, (double)d_A[c]) : eps;
 +
 
 +
            matvec(d_A, d_LMS2, d_C);
 +
            memcpy(&d_Tar[y * 3 + x], d_C, 3 * sizeof(float));
 +
         }
 
     }
 
     }
     status = cublasSetMatrix(n sizeof(float), h_B, n, devPtrB, n);
+
}
     if (status != CUBLAS_STATUS_SUCCESS) {
+
</pre>
        std::cerr << "***cublasSetVector B failed***\n";
+
 
        return;
+
Helper
     }
+
<pre>
     // ... calculate matrix-vector product
+
inline void vecTransfer(float* h, Color3d* v)
     int ld_d_A = n;
+
{
     int ld_d_B = n;
+
     for (int j = 0; j < 3; ++j)
     int ld_d_C = n;
+
        h[j] = v->v[j];
 +
}
 +
 
 +
//KERNEL Helper function does setup and launch
 +
void matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar)
 +
{
 +
    float *h_A, *h_RGB2, *h_LMS2, *d_Tar;
 +
    float *d_A, *d_RGB2, *d_LMS2, *d_C;
 +
 
 +
    int N = 3;
 +
 
 +
    h_A = (float*)malloc(sizeof(float) * N);
 +
    h_RGB2 = new float[mRGB2LMS->total()];
 +
     h_LMS2 = new float[mLMS2lab->total()];
 +
    //h_C = (float*)malloc(sizeof(float) * N);
 +
 
 +
    cudaMalloc((void**)&d_A, sizeof(float) * N);
 +
    cudaMalloc((void**)&d_RGB2, sizeof(float) * N * N);
 +
    cudaMalloc((void**)&d_LMS2, sizeof(float) * N * N);
 +
    cudaMalloc((void**)&d_C, sizeof(h_C));
 +
    cudaMalloc((void**)&d_Tar, sizeof(h_Tar));
 +
 
 +
     Color3d vec;
 +
 
 +
     //copy vec and matrix to host pointers
 +
    vecTransfer(h_A, &vec);
 +
    memcpy(h_RGB2, mRGB2LMS->data, mRGB2LMS->total());
 +
    memcpy(h_LMS2, mLMS2lab->data, mLMS2lab->total());
 +
 
 +
    cudaMemcpy(d_A, h_A, sizeof(float) * N, cudaMemcpyHostToDevice);
 +
    cudaMemcpy(d_RGB2, h_RGB2, sizeof(float) * N * N, cudaMemcpyHostToDevice);
 +
    cudaMemcpy(d_LMS2, h_LMS2, sizeof(float) * N * N, cudaMemcpyHostToDevice);
 +
    cudaMemcpy(d_Tar, h_Tar, sizeof(h_Tar), cudaMemcpyHostToDevice);
 +
 
 +
    matvec_kernel << <N / BLOCK_SIZE + 1, BLOCK_SIZE >> >(d_A, d_RGB2, d_LMS2, d_C, N, tarrow, tarcol, d_Tar);
 +
     //printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));
 +
 
 +
    cudaMemcpy(h_C, d_C, sizeof(h_C), cudaMemcpyDeviceToHost);
 +
 
 +
    free(h_A);
 +
     free(h_RGB2);
 +
    free(h_LMS2);
 +
     //free(h_C);
  
     float alpha = 1.0f;
+
     cudaFree(d_A);
    float beta = 0.0f;
+
     cudaFree(d_RGB2);
    status = cublasSgemv(handle, CUBLAS_OP_N, n, n,
+
     cudaFree(d_LMS2);
                        &alpha, devPtrA, ld_d_A, devPtrB, ld_d_B, &beta, devPtrC, ld_d_C);
+
     cudaFree(d_C);
     if (status != CUBLAS_STATUS_SUCCESS) {
+
     cudaFree(d_Tar);
        std::cerr << "***cublasSgemm failed***\n";
 
        return;
 
    }
 
    // ... copy result matrix from the device to the host
 
    status = cublasGetVector(n, sizeof(float), devPtrC, n, h_C, n);
 
    if (status != CUBLAS_STATUS_SUCCESS) {
 
        std::cerr << "***cublasGetVector C failed***\n";
 
        return;
 
    }
 
    // ... destroy cuBLAS context
 
    cublasDestroy(handle);
 
    // ... deallocate device memory
 
     cudaFree(&h_A);
 
     cudaFree(&h_B);
 
     cudaFree(&h_C);
 
 
}
 
}
 
</pre>
 
</pre>
 +
  
 
Changes to main loop
 
Changes to main loop
Line 686: Line 730:
 
Old
 
Old
 
<pre>
 
<pre>
// Transform back from lab to RGB
+
for (int y = 0; y < target.rows; y++) {
for(int y=0; y<target.rows; y++) {
+
    for (int x = 0; x < target.cols; x++) {
for(int x=0; x<target.cols; x++) {
+
      v = target.at<Color3d>(y, x);
v = target.at<Color3d>(y, x);
+
 
v = mlab2LMS * v;
+
      matvec_L(&v, &mlab2LMS, h_C);
for(int c=0; c<3; c++) v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;
+
      memcpy(&v, h_C, N * sizeof(float));
 +
 
 +
      for (int c = 0; c < 3; c++)
 +
      v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;
  
target.at<Color3d>(y, x) = mLMS2RGB * v;
+
      matvec_L(&v, &mLMS2RGB, h_C);
}
+
      memcpy(&target.at<Color3d>(y, x), h_C, N * sizeof(float));
 +
  }
 
}
 
}
 
</pre>
 
</pre>
Line 700: Line 748:
 
New
 
New
 
<pre>
 
<pre>
// allocate host memory
+
    float* h_TARGET = (float *)malloc(sizeof(target.data));
float* h_C = new float[3]; // result
+
    memcpy(h_TARGET, target.data, sizeof(target.data));
 +
    matvec_L(&mlab2LMS, &mLMS2RGB, h_C, rows, cols, h_TARGET);
 +
</pre>
 +
 
 +
=== Results ===
 +
 
 +
For our profiling we had three test cases.
 +
<pre>
 +
(Size is in pixels)
 +
Small : 400x400 by 400x400
 +
Medium : 1400x989 by 1215x717
 +
Large : 7362x4858 by 6000x4032
 +
</pre>
  
// Transform back from lab to RGB
+
[[File:a2timings.PNG]]
for(int y=0; y<target.rows; y++) {
 
    for(int x=0; x<target.cols; x++) {
 
        v = target.at<Color3d>(y, x);
 
        sgemv(&mlab2LMS, v, h_c);
 
        memcpy(v, h_c, sizeof(Color3d));
 
  
        for(int c=0; c<3; c++)
+
== Assignment 3 ==
            v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;
 
  
        sgemv(&mLMS2RGB , v, h_c);
+
=== Link to Original Unchanged Project and project used for a3 ===
        memcpy(target.at<Color3d>(y, x), h_c, sizeof(Color3d));
+
 
    }
+
[https://www.dropbox.com/s/32q70e9iovpm80l/a2.7z?dl=0 Download]
 +
 
 +
The file contains the visual studio solutions used for Assignment 3 as well as visual studio solution version of the original code.
 +
 
 +
To run the location of the OPENCV directory must be applied to the following project properties:
 +
 
 +
1. Under c/c++ -> general -> additional include directories -> the path to opencv\..\..\include
 +
2. Under linker -> general -> additional library directories -> the path to opencv\..\..\lib
 +
 
 +
After doing the above, build the solution and run the appropriate Release or Debug exe with the target and reference image as arguments.
 +
 
 +
A link the the corresponding xls file of the run time can be found [https://www.dropbox.com/s/ecu7eycyv41krwq/a3.xlsx?dl=0 here]
 +
 
 +
=== What was done ===
 +
 
 +
At first the power function used was switched out with __pow in the kernel as the traditional pow function is more heavy of a function. But the results were very small causing a different from around 10-30 milliseconds. Afterwards the kernel was upgraded to implement grid and strides. In doing so instead of doing all the calculations on one thread, many of them were able to be calculated on a separate row. This made it so that for every pixel with a row one thread would be responsible for the colour shift.
 +
 
 +
Other implementations were made to transfer all the data necessary for calculations all at once from the beginning and then perform all calculations done by tatsy on the device side, but due to time constraints and the complication of the project we were unable to fully implement these changes. The code is however left (commented) in the included project download.
 +
 
 +
=== Optimized Kernel ===
 +
<pre>
 +
__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C,
 +
const int n, int targetrows, int targetcols, float* d_Tar)
 +
{
 +
const double eps = 1.0e-4;
 +
//grid-stride loop
 +
for (int tid = threadIdx.x + blockIdx.x * blockDim.x;
 +
tid < targetrows;
 +
tid += blockDim.x * gridDim.x)
 +
{
 +
for (int x = 0; x < targetcols; ++x) {
 +
memcpy(&d_A, &d_Tar[tid * 3 + x], 3 * sizeof(float));
 +
 
 +
matvec(d_A, d_RGB2, d_C);
 +
memcpy(&d_A, d_C, 3 * sizeof(float));
 +
 
 +
for (int c = 0; c < 3; c++)
 +
d_A[c] = d_A[c] > -5.0 ? __powf(10.0f, d_A[c]) : eps;
 +
 
 +
matvec(d_A, d_LMS2, d_C);
 +
memcpy(&d_Tar[tid * 3 + x], d_C, 3 * sizeof(float));
 +
}
 +
}
 
}
 
}
 
</pre>
 
</pre>
 +
 +
=== Results ===
 +
 +
[[File:a3timings.PNG]]
 +
 +
 +
When running in release and comparing the results to the original unchanged project straight from tatsy we noticed that for very small images the cuda version is slightly slower. This is probably due to the actual conversion of colour on a smaller image to be much shorter, whereas the transferring of the data over to device memory itself may add additional time. Overall when comparing the optimized and unoptimized versions of the kernel there was a visible increase.

Latest revision as of 19:12, 13 April 2017

The parallelizing Express

          o x o x o x o . . .
        o      _____            _______________ ___=====__T___
      .][__n_n_|DD[  ====_____  |    |.\/.|   | |   |_|     |_
     >(________|__|_[_________]_|____|_/\_|___|_|___________|_|
     _/oo OOOOO oo`  ooo   ooo   o^o       o^o   o^o     o^o

-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-


Repository

Team Members

  1. Abbas Zoeb
  2. Jadrian Sunga
  3. Marko Radmanovic
Email All

Progress

Assignment 1

SudokuMP by Marko R

SudokuMP solves sudoku puzzles given a txt file arranging the puzzle and the size of the puzzle. There are two methods to solve said puzzles the brute force method and the humanistic solver. Profiling wasn't performed on the humanistic solver since it manages to solve all the 16x16 puzzles extremely quickly the worst of them (expert) still being at around 12ms. The expert puzzle is too difficult for the brute force method and takes over an hour to run so it wasn't profiled. The project can be found here: SudokuMP

Running the Program

The git repository comes with a pre-compiled version of the program so you can run it immediately (you might need to give it the correct permissions to run). The git repository also comes with a make file so if you want to compile your own version delete the pre-compiled version and run make. The program takes two command line arguments the first being the path to the txt file containing the sudoku puzzle and the second argument being the size of the puzzle cells.

16x16 easy 1

Command line: ./sudoku src/16x16/easy1.txt 16

   Before :                                                  After :
   00 00 00 00 | 00 00 00 08 | 00 00 07 00 | 00 05 04 00     11 01 09 10 | 06 13 03 08 | 02 14 07 12 | 16 05 04 15  
   00 00 04 00 | 00 02 16 00 | 01 05 00 11 | 09 12 00 00     06 15 04 13 | 10 02 16 14 | 01 05 03 11 | 09 12 08 07  
   00 08 00 07 | 05 00 15 11 | 09 04 16 00 | 02 00 00 00     14 08 03 07 | 05 12 15 11 | 09 04 16 06 | 02 10 13 01  
   05 00 12 16 | 01 09 00 00 | 00 13 00 00 | 00 00 00 06     05 02 12 16 | 01 09 07 04 | 08 13 10 15 | 03 11 14 06
   -----------------------------------------------------     -----------------------------------------------------
   15 00 10 00 | 00 00 02 00 | 00 00 05 00 | 00 00 16 03     15 11 10 09 | 08 07 02 12 | 13 01 05 04 | 14 06 16 03  
   00 07 00 00 | 00 00 00 10 | 06 15 08 00 | 00 09 02 00     12 07 14 03 | 13 05 04 10 | 06 15 08 16 | 01 09 02 11  
   01 00 13 00 | 14 00 00 00 | 10 07 00 09 | 00 08 00 05     01 16 13 04 | 14 06 11 03 | 10 07 02 09 | 12 08 15 05  
   08 06 00 00 | 00 00 01 00 | 14 00 00 03 | 13 04 00 10     08 06 02 05 | 09 15 01 16 | 14 12 11 03 | 13 04 07 10  
   -----------------------------------------------------     -----------------------------------------------------
   13 00 16 06 | 15 00 00 09 | 00 11 00 00 | 00 00 01 02     13 14 16 06 | 15 10 05 09 | 04 11 12 08 | 07 03 01 02  
   03 00 08 00 | 12 00 06 01 | 00 00 00 07 | 00 16 00 13     03 10 08 11 | 12 04 06 01 | 15 02 14 07 | 05 16 09 13  
   00 05 01 00 | 00 14 08 07 | 16 00 00 00 | 00 00 12 00     09 05 01 02 | 03 14 08 07 | 16 06 13 10 | 11 15 12 04  
   07 04 00 00 | 00 11 00 00 | 00 09 00 00 | 00 14 00 08     07 04 15 12 | 16 11 13 02 | 03 09 01 05 | 10 14 06 08  
   -----------------------------------------------------     -----------------------------------------------------
   16 00 00 00 | 00 00 12 00 | 00 00 06 01 | 15 02 00 09     16 13 07 14 | 04 03 12 05 | 11 08 06 01 | 15 02 10 09  
   00 00 00 01 | 00 08 10 06 | 12 16 00 13 | 04 00 03 00     02 09 05 01 | 11 08 10 06 | 12 16 15 13 | 04 07 03 14  
   00 00 11 08 | 02 00 09 15 | 00 03 04 00 | 00 13 00 00     10 12 11 08 | 02 01 09 15 | 07 03 04 14 | 06 13 05 16  
   00 03 06 00 | 00 16 00 00 | 05 00 00 00 | 00 00 00 00     04 03 06 15 | 07 16 14 13 | 05 10 09 02 | 08 01 11 12  


NOTE : The profiles for this project were quite large so only the most used functions are listed here. The entire profile file can be find through the links provided above the respective flat profile.

Link to full profile : FULL

Flat Profile :

Each sample counts as 0.01 seconds.
no time accumulated
 %   cumulative   self              self     total           
time   seconds   seconds    calls   s/call   s/call  name    
65.87      0.81     0.81    59789     0.00     0.00  create_copy_board(Board*)
21.14      1.07     0.26    59777     0.00     0.00  choose_cell_bf(Board*, int&, int&)
 6.51      1.15     0.08    59790     0.00     0.00  Board::Board(int)
 6.51      1.23     0.08    59777     0.00     0.00  Board::~Board()
 0.00      1.23     0.00   179709     0.00     0.00  clear_number(Board*, int, Align, int)
 0.00      1.23     0.00   179370     0.00     0.00  Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&)
 0.00      1.23     0.00    59903     0.00     0.00  update_solution(Board*, int, int, int)
 0.00      1.23     0.00    59790     0.00     0.00  _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_
 0.00      1.23     0.00    59790     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      1.23     0.00    59790     0.00     0.00  decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      1.23     0.00    59790     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&)
 0.00      1.23     0.00    59790     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&)
 0.00      1.23     0.00    59790     0.00     0.00  operator new(unsigned long, void*)
 0.00      1.23     0.00    59784     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      1.23     0.00    59780     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::end()
 0.00      1.23     0.00    59778     0.00     0.00  void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**)
 0.00      1.23     0.00    59778     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const
 0.00      1.23     0.00    59778     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::empty() const
 0.00      1.23     0.00    59778     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const
 0.00      1.23     0.00    59778     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator--()
 0.00      1.23     0.00    59778     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      1.23     0.00    59778     0.00     0.00  void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      1.23     0.00    59778     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::back()
 0.00      1.23     0.00    59778     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::pop_back()
 0.00      1.23     0.00    59778     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop()
 0.00      1.23     0.00    59778     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top()
 0.00      1.23     0.00    59778     0.00     0.00  bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      1.23     0.00    59777     0.00     0.00  update_stack(Board*)
 0.00      1.23     0.00       24     0.00     0.00  std::__deque_buf_size(unsigned long)

16x16 medium 1

Command line: ./sudoku src/16x16/medium1.txt 16

   Before :                                                  After :
   00 00 06 15 | 00 12 09 04 | 11 02 07 16 | 00 03 00 13     05 01 06 15 | 08 12 09 04 | 11 02 07 16 | 14 03 10 13  
   00 10 00 00 | 01 00 00 15 | 12 00 08 00 | 00 02 00 00     14 10 13 03 | 01 05 06 15 | 12 09 08 04 | 16 02 11 07  
   07 11 16 02 | 00 00 00 03 | 05 15 01 06 | 09 00 00 00     07 11 16 02 | 13 10 14 03 | 05 15 01 06 | 09 04 12 08  
   00 12 00 00 | 07 11 00 00 | 00 03 13 14 | 06 15 05 00     08 12 09 04 | 07 11 16 02 | 10 03 13 14 | 06 15 05 01  
   -----------------------------------------------------     -----------------------------------------------------
   00 06 12 00 | 00 00 11 00 | 00 00 03 10 | 00 13 14 15     04 06 12 01 | 02 09 11 08 | 16 07 03 10 | 05 13 14 15  
   00 00 10 00 | 00 14 00 00 | 00 01 00 00 | 00 00 09 00     03 16 10 07 | 15 14 05 13 | 06 01 04 12 | 11 08 09 02  
   02 09 00 00 | 03 00 00 00 | 14 00 15 05 | 12 00 06 00     02 09 11 08 | 03 16 10 07 | 14 13 15 05 | 12 01 06 04  
   15 00 00 13 | 00 06 12 00 | 09 00 00 00 | 10 07 00 03     15 05 14 13 | 04 06 12 01 | 09 08 11 02 | 10 07 16 03  
   -----------------------------------------------------     -----------------------------------------------------
   10 02 00 00 | 05 03 00 14 | 15 06 00 01 | 08 09 00 11     10 02 07 16 | 05 03 13 14 | 15 06 12 01 | 08 09 04 11  
   00 00 00 14 | 00 00 01 06 | 04 00 00 00 | 07 00 00 00     12 13 03 14 | 09 15 01 06 | 04 11 10 08 | 07 16 02 05  
   11 00 08 09 | 00 00 07 00 | 00 14 00 00 | 00 06 15 12     11 04 08 09 | 10 02 07 16 | 03 14 05 13 | 01 06 15 12  
   00 15 01 00 | 11 00 08 00 | 00 00 00 07 | 00 00 03 00     06 15 01 05 | 11 04 08 12 | 02 16 09 07 | 13 14 03 10  
   -----------------------------------------------------     -----------------------------------------------------
   16 00 02 00 | 14 00 00 10 | 13 05 00 15 | 04 00 01 00     16 07 02 11 | 14 08 03 10 | 13 05 06 15 | 04 12 01 09  
   00 00 00 00 | 00 00 00 00 | 08 00 00 00 | 00 00 07 14     01 03 15 06 | 12 13 04 09 | 08 10 16 11 | 02 05 07 14  
   00 00 00 00 | 16 00 00 00 | 00 00 14 03 | 00 00 00 00     09 08 05 12 | 16 01 02 11 | 07 04 14 03 | 15 10 13 06  
   00 00 00 00 | 06 00 15 05 | 01 12 00 00 | 00 11 08 00     13 14 04 10 | 06 07 15 05 | 01 12 02 09 | 03 11 08 16  

Link to full profile : FULL

Flat Profile :

Each sample counts as 0.01 seconds.
 %   cumulative   self              self     total           
time   seconds   seconds    calls  ms/call  ms/call  name    
70.23      0.33     0.33    23099     0.01     0.02  create_copy_board(Board*)
14.90      0.40     0.07    23083     0.00     0.00  choose_cell_bf(Board*, int&, int&)
10.64      0.45     0.05    23100     0.00     0.00  Board::Board(int)
 4.26      0.47     0.02    23083     0.00     0.00  Board::~Board()
 0.00      0.47     0.00    69645     0.00     0.00  clear_number(Board*, int, Align, int)
 0.00      0.47     0.00    69300     0.00     0.00  Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&)
 0.00      0.47     0.00    23215     0.00     0.00  update_solution(Board*, int, int, int)
 0.00      0.47     0.00    23100     0.00     0.00  _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_
 0.00      0.47     0.00    23100     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      0.47     0.00    23100     0.00     0.00  decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      0.47     0.00    23100     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&)
 0.00      0.47     0.00    23100     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&)
 0.00      0.47     0.00    23100     0.00     0.00  operator new(unsigned long, void*)
 0.00      0.47     0.00    23090     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      0.47     0.00    23086     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::end()
 0.00      0.47     0.00    23084     0.00     0.00  void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**)
 0.00      0.47     0.00    23084     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const
 0.00      0.47     0.00    23084     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::empty() const
 0.00      0.47     0.00    23084     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const
 0.00      0.47     0.00    23084     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator--()
 0.00      0.47     0.00    23084     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      0.47     0.00    23084     0.00     0.00  void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      0.47     0.00    23084     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::back()
 0.00      0.47     0.00    23084     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::pop_back()
 0.00      0.47     0.00    23084     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop()
 0.00      0.47     0.00    23084     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top()
 0.00      0.47     0.00    23084     0.00     0.00  bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      0.47     0.00    23083     0.00     0.02  update_stack(Board*)
 0.00      0.47     0.00       24     0.00     0.00  std::__deque_buf_size(unsigned long)


16x16 hard 1

Command line: ./sudoku src/16x16/hard1.txt 16

   Before :                                                  After :
   02 00 00 00 | 00 00 00 16 | 00 06 00 00 | 00 00 00 05     02 07 14 03 | 09 15 13 16 | 04 06 10 08 | 01 11 12 05  
   00 10 08 00 | 00 00 00 00 | 00 00 00 00 | 00 13 15 00     04 10 08 06 | 05 01 11 12 | 02 07 14 03 | 16 13 15 09  
   00 01 12 05 | 00 00 00 00 | 00 00 00 00 | 10 04 08 00     11 01 12 05 | 03 07 02 14 | 13 09 16 15 | 10 04 08 06  
   13 00 15 09 | 06 00 00 00 | 00 00 01 12 | 07 02 00 00     13 16 15 09 | 06 08 04 10 | 11 05 01 12 | 07 02 14 03  
   -----------------------------------------------------     -----------------------------------------------------
   00 00 00 08 | 12 11 00 00 | 00 00 03 02 | 09 00 00 00     01 06 04 08 | 12 11 07 05 | 16 14 03 02 | 09 10 13 15  
   16 00 00 00 | 15 13 10 00 | 00 08 06 04 | 00 00 00 00     16 03 02 14 | 15 13 10 09 | 01 08 06 04 | 05 07 11 12  
   00 00 13 00 | 00 04 01 06 | 07 12 05 00 | 00 00 00 00     10 09 13 15 | 08 04 01 06 | 07 12 05 11 | 03 16 02 14  
   00 00 00 00 | 14 02 16 03 | 10 15 09 00 | 00 00 00 00     07 05 11 12 | 14 02 16 03 | 10 15 09 13 | 06 01 04 08  
   -----------------------------------------------------     -----------------------------------------------------
   15 00 00 16 | 10 00 00 00 | 00 00 04 05 | 00 00 00 07     15 02 09 16 | 10 06 08 13 | 12 01 04 05 | 11 14 03 07  
   00 11 00 07 | 00 00 15 00 | 00 10 00 06 | 04 00 05 01     14 11 03 07 | 16 09 15 02 | 08 10 13 06 | 04 12 05 01  
   12 00 05 01 | 00 03 14 00 | 00 16 02 00 | 13 08 00 10     12 04 05 01 | 07 03 14 11 | 15 16 02 09 | 13 08 06 10  
   00 13 06 00 | 00 00 00 04 | 14 00 00 00 | 02 15 09 00     08 13 06 10 | 01 05 12 04 | 14 03 11 07 | 02 15 09 16  
   -----------------------------------------------------     -----------------------------------------------------
   03 00 07 11 | 00 16 00 00 | 00 00 15 00 | 08 05 00 04     03 12 07 11 | 02 16 09 01 | 06 13 15 14 | 08 05 10 04  
   00 00 00 13 | 00 00 05 08 | 03 11 00 00 | 14 00 00 00     06 15 01 13 | 04 12 05 08 | 03 11 07 10 | 14 09 16 02  
   00 00 00 04 | 11 00 00 00 | 00 00 00 16 | 15 00 00 00     05 08 10 04 | 11 14 03 07 | 09 02 12 16 | 15 06 01 13  
   00 00 00 00 | 13 10 06 15 | 05 04 08 01 | 00 00 00 00     09 14 16 02 | 13 10 06 15 | 05 04 08 01 | 12 03 07 11  

Link to full profile : FULL

Flat Profile :

 Each sample counts as 0.01 seconds.
 %   cumulative   self              self     total          
time   seconds   seconds    calls   s/call   s/call  name    
66.17      5.94     5.94   432968     0.00     0.00  create_copy_board(Board*)
22.28      7.94     2.00   432950     0.00     0.00  choose_cell_bf(Board*, int&, int&)
 6.02      8.48     0.54   432969     0.00     0.00  Board::Board(int)
 4.46      8.88     0.40   432950     0.00     0.00  Board::~Board()
 0.67      8.94     0.06  1299222     0.00     0.00  clear_number(Board*, int, Align, int)
 0.22      8.96     0.02   433074     0.00     0.00  update_solution(Board*, int, int, int)
 0.22      8.98     0.02   432950     0.00     0.00  update_stack(Board*)
 0.00      8.98     0.00  1298907     0.00     0.00  Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&)
 0.00      8.98     0.00   432969     0.00     0.00  _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_
 0.00      8.98     0.00   432969     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      8.98     0.00   432969     0.00     0.00  decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      8.98     0.00   432969     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&)
 0.00      8.98     0.00   432969     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&)
 0.00      8.98     0.00   432969     0.00     0.00  operator new(unsigned long, void*)
 0.00      8.98     0.00   432957     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      8.98     0.00   432953     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::end()
 0.00      8.98     0.00   432951     0.00     0.00  void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**)
 0.00      8.98     0.00   432951     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const
 0.00      8.98     0.00   432951     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::empty() const
 0.00      8.98     0.00   432951     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const
 0.00      8.98     0.00   432951     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator--()
 0.00      8.98     0.00   432951     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      8.98     0.00   432951     0.00     0.00  void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      8.98     0.00   432951     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::back()
 0.00      8.98     0.00   432951     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::pop_back()
 0.00      8.98     0.00   432951     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop()
 0.00      8.98     0.00   432951     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top()
 0.00      8.98     0.00   432951     0.00     0.00  bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      8.98     0.00       24     0.00     0.00  std::__deque_buf_size(unsigned long)
 0.00      8.98     0.00        6     0.00     0.00  std::_Deque_iterator<State*, State*&, State**>::_Deque_iterator(std::_Deque_iterator<State*, State*&, State**> const&)


16x16 hard 2

Command line: ./sudoku src/16x16/hard2.txt 16

   Before :                                                  After :
   00 00 00 13 | 15 00 00 07 | 08 11 00 16 | 00 00 00 00     10 03 14 13 | 15 12 05 07 | 08 11 09 16 | 01 04 02 06  
   00 00 00 00 | 00 01 14 00 | 00 00 00 02 | 00 00 05 00     07 15 11 16 | 03 01 14 13 | 10 06 04 02 | 12 08 05 09  
   00 00 08 00 | 10 00 00 00 | 00 00 15 12 | 00 00 03 14     09 06 08 01 | 10 11 02 04 | 05 07 15 12 | 13 16 03 14  
   00 00 00 00 | 06 00 00 09 | 01 00 13 00 | 00 10 07 00     04 05 02 12 | 06 16 08 09 | 01 14 13 03 | 15 10 07 11  
   -----------------------------------------------------     -----------------------------------------------------
   00 07 05 00 | 12 10 00 08 | 00 00 06 00 | 00 13 00 00     03 07 05 02 | 12 10 11 08 | 14 04 06 15 | 16 13 09 01  
   00 00 00 11 | 00 00 00 00 | 07 00 00 00 | 00 15 00 03     12 08 01 11 | 14 13 16 06 | 07 02 05 09 | 04 15 10 03  
   00 00 06 04 | 00 15 09 00 | 12 03 11 00 | 00 00 00 07     16 13 06 04 | 02 15 09 01 | 12 03 11 10 | 05 14 08 07  
   14 00 00 15 | 00 00 00 00 | 00 00 00 13 | 06 00 00 00     14 09 10 15 | 05 04 07 03 | 16 01 08 13 | 06 12 11 02  
   -----------------------------------------------------     -----------------------------------------------------
   00 04 00 09 | 07 00 00 00 | 15 00 00 00 | 00 11 00 10     02 04 13 09 | 07 14 12 05 | 15 16 03 06 | 08 11 01 10  
   01 14 00 00 | 00 00 03 00 | 00 09 07 00 | 00 00 13 04     01 14 12 10 | 16 06 03 15 | 11 09 07 08 | 02 05 13 04  
   05 00 00 00 | 00 08 00 11 | 04 10 00 00 | 14 00 00 15     05 16 03 06 | 09 08 13 11 | 04 10 02 01 | 14 07 12 15  
   00 00 15 07 | 00 02 00 00 | 00 00 14 00 | 00 00 06 00     08 11 15 07 | 04 02 01 10 | 13 12 14 05 | 09 03 06 16  
   -----------------------------------------------------     -----------------------------------------------------
   00 10 09 00 | 00 00 00 14 | 00 13 00 00 | 00 00 00 08     11 10 09 05 | 01 07 15 14 | 02 13 12 04 | 03 06 16 08  
   00 02 00 08 | 11 00 04 00 | 03 00 16 00 | 00 00 00 00     06 02 07 08 | 11 09 04 12 | 03 05 16 14 | 10 01 15 13  
   13 00 00 00 | 00 05 00 16 | 00 15 00 07 | 11 00 14 12     13 01 04 03 | 08 05 06 16 | 09 15 10 07 | 11 02 14 12  
   00 12 00 00 | 13 03 10 00 | 06 08 00 11 | 00 00 00 00     15 12 16 14 | 13 03 10 02 | 06 08 01 11 | 07 09 04 05  

Link to full profile : FULL

Flat Profile :

 Each sample counts as 0.01 seconds.
 %   cumulative   self              self     total          
time   seconds   seconds    calls   s/call   s/call  name    
63.01      4.68     4.68   347198     0.00     0.00  create_copy_board(Board*)
24.77      6.52     1.84   347184     0.00     0.00  choose_cell_bf(Board*, int&, int&)
 7.54      7.08     0.56   347199     0.00     0.00  Board::Board(int)
 4.04      7.38     0.30   347184     0.00     0.00  Board::~Board()
 0.54      7.42     0.04  1041882     0.00     0.00  clear_number(Board*, int, Align, int)
 0.13      7.43     0.01   347294     0.00     0.00  update_solution(Board*, int, int, int)
 0.00      7.43     0.00  1041597     0.00     0.00  Board* const& std::forward<Board* const&>(std::remove_reference<Board* const&>::type&)
 0.00      7.43     0.00   347199     0.00     0.00  _ZN9__gnu_cxx13new_allocatorIP5BoardE9constructIS2_IRKS2_EEEvPT_DpOT0_
 0.00      7.43     0.00   347199     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__construct_helper<Board*, Board* const&>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      7.43     0.00   347199     0.00     0.00  decltype (_S_construct({parm#1}, {parm#2}, (forward<Board* const&>)({parm#3}))) std::allocator_traits<std::allocator<Board*> >::construct<Board*, Board* const&>(std::allocator<Board*>&, Board**, Board* const&)
 0.00      7.43     0.00   347199     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::push_back(Board* const&)
 0.00      7.43     0.00   347199     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::push(Board* const&)
 0.00      7.43     0.00   347199     0.00     0.00  operator new(unsigned long, void*)
 0.00      7.43     0.00   347191     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::_Deque_iterator(std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      7.43     0.00   347187     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::end()
 0.00      7.43     0.00   347185     0.00     0.00  void __gnu_cxx::new_allocator<Board*>::destroy<Board*>(Board**)
 0.00      7.43     0.00   347185     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator*() const
 0.00      7.43     0.00   347185     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::empty() const
 0.00      7.43     0.00   347185     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::empty() const
 0.00      7.43     0.00   347185     0.00     0.00  std::_Deque_iterator<Board*, Board*&, Board**>::operator--()
 0.00      7.43     0.00   347185     0.00     0.00  std::enable_if<std::__and_<std::allocator_traits<std::allocator<Board*> >::__destroy_helper<Board*>::type>::value, void>::type std::allocator_traits<std::allocator<Board*> >::_S_destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      7.43     0.00   347185     0.00     0.00  void std::allocator_traits<std::allocator<Board*> >::destroy<Board*>(std::allocator<Board*>&, Board**)
 0.00      7.43     0.00   347185     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::back()
 0.00      7.43     0.00   347185     0.00     0.00  std::deque<Board*, std::allocator<Board*> >::pop_back()
 0.00      7.43     0.00   347185     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::pop()
 0.00      7.43     0.00   347185     0.00     0.00  std::stack<Board*, std::deque<Board*, std::allocator<Board*> > >::top()
 0.00      7.43     0.00   347185     0.00     0.00  bool std::operator==<Board*, Board*&, Board**>(std::_Deque_iterator<Board*, Board*&, Board**> const&, std::_Deque_iterator<Board*, Board*&, Board**> const&)
 0.00      7.43     0.00   347184     0.00     0.00  update_stack(Board*)
 0.00      7.43     0.00       24     0.00     0.00  std::__deque_buf_size(unsigned long)

Analysis / Code Snippet

The two functions which dominate the runtime of this program are create_copy_board which accounts for roughly 60% or more of the exceution time and choose_cell_bf which accounts for another 20% generally.

Board* create_copy_board(Board* b){
  Board* copy_board = new Board(b->dim);
	for(int row = 0; row < b->dim; row++) {
		for(int col = 0; col < b->dim; col++) {
			copy_board->solution[row][col] = b->solution[row][col];
			for(int index = 0; index < b->dim; index++) {
				copy_board->cells[row][col][index] = b->cells[row][col][index];
			}
		}
	}
  copy_board->cells_solved = b->cells_solved;
	return copy_board;
}

/* Choose cell with the least number of possible values
 * Return 0 if that cell has 0 values
 */
bool choose_cell_bf(Board* b, int &row, int &col) {
  int least = b->dim+1, counter;
  for(int r = 0; r < b->dim; r++) {
    for(int c = 0; c < b->dim; c++) {
      counter = 0;
      if(!b->solution[r][c]) {
        for(int num = 0; num < b->dim; num++) {
          if(b->cells[r][c][num]) {
            counter++;
          }
        }
        if(counter < least) {
          least = counter;
          row = r;
          col = c;
        }
      }
    }
  }
  return (least != 0);
}

ColorTransfer by Jadrian S

The Source code can be found here This program is an implementation of image color transfer detailed by Erik Reinhard and coded by Tatsy it use OpenCV to do so. This program takes 2 images the first image denoted by the args is the image being editted with colours. The second image is an image that the program will take the colours from. After the color transfer an image is generated with the color transfer completed and is opened for the user to view.


Creating and Running the Project with gprof profiling

Perform the following steps in a Linux/Unix System:

STEP 1 - Download the following zips and extract their contents as they will be used for later steps

-> test files for opencv:Download    OR    File:Test.zip
-> ColorTransfer:Download    OR    File:ColorTransfer.zip

STEP 2 - Install all the required entries(make,cmake,git,gprof,etc) with the following terminal command:

-> sudo apt-get install build-essential cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev binutils

STEP 3 - Download and Install OpenCV from their official page located here. You can do this with the following steps and commands:

-> Download the latest source data of OpenCV for Linux/Mac
-> Extract the zipped source data into its appropriately named folder opencv-(latest version)
-> Open terminal and enter us the cd command to enter the extracted folder opencv-(latest version)
-> Create a build directy with the command mkdir build
-> Use the cd command to enter the build folder
-> Use the following cmake command to generate the appropriate files for the make command: cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_INSTALL_PREFIX=/usr/local ..
-> Use make with the -j7 command to allow for up to 7 jobs when preparing the install: make -j7
-> Install with: sudo make install
-> OpenCV should be installed

STEP 4 - Use the Test Folder provided by the dropbox link above to see if OpenCV is working correctly. You can follow these steps to do so

-> In terminal cd into the test folder
-> cmake .
-> make
-> run display image with the follow command ./DisplayImage ahri.jpg
-> If the image provided is displayed OpenCV should be in working order.

STEP 5 - Build the ColorTransfer program with the folder provided by the dropbox link and create a flat profile using gprof

-> Create the files for make using the following command: cmake -DCMAKE_CXX_FLAGS=-pg -DCMAKE_EXE_LINKER_FLAGS=-pg -DCMAKE_SHARED_LINKER_FLAGS=-pg .
-> The additional options include -pg within the cmake so that the appropriate gmon.out can be created for gprof
-> After the files are generated use the make command
-> Run the created program with 2 arguments arg1 being the image beind editted and arg2 being the image to take colours from (two images are provided for your convenience).
-> If using the provided image the command should like: ./ColorTransfer ahri.jpg ahri2.jpg
-> This should generate the colour transferred output image as well as the appropriate gmon.out file
-> Use the either of the following commands for the flat profile: gprof -p -b ./ColorTransfer > ColorTransfer.flt  OR  gprof test_gprof gmon.out > analysis.txt

Example Output

If using the examples provided you should have the following image results:

Input 1

Ahri.jpg

Input 2

Ahri2.jpg

Output

Ahri3.jpg

Profile

Flat profile:

Each sample counts as 0.01 seconds.

 %   cumulative   self              self     total           
time   seconds   seconds    calls  ms/call  ms/call  name    
36.65      0.37     0.37 187749717     0.00     0.00  Color3d::operator()(int)
22.78      0.60     0.23  6771110     0.00     0.00  operator*(cv::Mat const&, Color3d&)
13.37      0.74     0.14 60939990     0.00     0.00  double const& cv::Mat::at<double>(int, int) const
11.89      0.86     0.12                             main
 4.95      0.91     0.05 14413379     0.00     0.00  Color3d::operator=(Color3d const&)
 3.96      0.95     0.04 13156175     0.00     0.00  Color3d& cv::Mat::at<Color3d>(int, int)
 1.98      0.97     0.02  7543200     0.00     0.00  double& cv::Mat::at<double>(int, int)
 0.99      0.98     0.01 13156184     0.00     0.00  Color3d::Color3d()
 0.99      0.99     0.01  2128357     0.00     0.00  Color3d::operator*(Color3d const&)
 0.99      1.00     0.01                             frame_dummy
 0.50      1.00     0.01  4256710     0.00     0.00  Color3d::operator+(Color3d const&)
 0.50      1.01     0.01        1     5.00     5.00  cvflann::anyimpl::big_any_policy<cvflann::anyimpl::empty_any>::big_any_policy()
 0.50      1.01     0.01                             Color3d::Color3d(Color3d const&)
 0.00      1.01     0.00       44     0.00     0.00  cv::Mat::release()
 0.00      1.01     0.00       17     0.00     0.00  cv::_InputArray::init(int, void const*)
 0.00      1.01     0.00       14     0.00     0.00  cv::Mat::~Mat()
 0.00      1.01     0.00       11     0.00     0.00  cv::_InputArray::~_InputArray()
 0.00      1.01     0.00       11     0.00     0.00  cv::Size_<int>::Size_()
 0.00      1.01     0.00        9     0.00     0.00  cvflann::anyimpl::base_any_policy::base_any_policy()
 0.00      1.01     0.00        9     0.00     0.00  cvflann::anyimpl::base_any_policy::~base_any_policy()
 0.00      1.01     0.00        6     0.00     0.00  cv::_InputArray::_InputArray()
 0.00      1.01     0.00        6     0.00     0.00  cv::_OutputArray::_OutputArray(cv::Mat&)
 0.00      1.01     0.00        6     0.00     0.00  cv::_OutputArray::~_OutputArray()
 0.00      1.01     0.00        6     0.00     0.00  cv::MatSize::MatSize(int*)
 0.00      1.01     0.00        6     0.00     0.00  cv::MatStep::MatStep()
 0.00      1.01     0.00        5     0.00     0.00  cv::_InputArray::_InputArray(cv::Mat const&)
 0.00      1.01     0.00        5     0.00     0.00  cv::String::String(char const*)
 0.00      1.01     0.00        5     0.00     0.00  cv::String::~String()
 0.00      1.01     0.00        4     0.00     0.00  cv::Mat::create(int, int, int)
 0.00      1.01     0.00        4     0.00     0.00  cv::Mat::Mat(int, int, int)
 0.00      1.01     0.00        4     0.00     0.00  Color3d::divide(double)
 0.00      1.01     0.00        4     0.00     0.00  Color3d::Color3d(double, double, double)
 0.00      1.01     0.00        2     0.00     0.00  cv::Mat::Mat()
 0.00      1.01     0.00        2     0.00     0.00  cv::MatExpr::~MatExpr()
 0.00      1.01     0.00        2     0.00     0.00  Color3d::operator-(Color3d const&)
 0.00      1.01     0.00        2     0.00     0.00  cv::Mat::empty() const
 0.00      1.01     0.00        2     0.00     0.00  cv::Mat::total() const
 0.00      1.01     0.00        2     0.00     0.00  cv::MatExpr::operator cv::Mat() const
 0.00      1.01     0.00        1     0.00     5.00  _GLOBAL__sub_I__ZmlRKN2cv3MatER7Color3d
 0.00      1.01     0.00        1     0.00     5.00  __static_initialization_and_destruction_0(int, int)
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::big_any_policy<cv::String>::big_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::big_any_policy<cvflann::flann_algorithm_t>::big_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::big_any_policy<cvflann::flann_centers_init_t>::big_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::small_any_policy<char const*>::small_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::small_any_policy<bool>::small_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::small_any_policy<float>::small_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::small_any_policy<int>::small_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::small_any_policy<unsigned int>::small_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cv::String>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cv::String>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cvflann::anyimpl::empty_any>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cvflann::anyimpl::empty_any>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cvflann::flann_algorithm_t>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cvflann::flann_algorithm_t>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cvflann::flann_centers_init_t>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<cvflann::flann_centers_init_t>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<char const*>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<char const*>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<bool>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<bool>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<float>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<float>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<int>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<int>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<unsigned int>::typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  cvflann::anyimpl::typed_base_any_policy<unsigned int>::~typed_base_any_policy()
 0.00      1.01     0.00        1     0.00     0.00  __gnu_cxx::new_allocator<int>::new_allocator()
 0.00      1.01     0.00        1     0.00     0.00  __gnu_cxx::new_allocator<int>::~new_allocator()
 0.00      1.01     0.00        1     0.00     0.00  std::allocator<int>::allocator()
 0.00      1.01     0.00        1     0.00     0.00  std::allocator<int>::~allocator()
 0.00      1.01     0.00        1     0.00     0.00  void std::_Destroy_aux<true>::__destroy<int*>(int*, int*)
 0.00      1.01     0.00        1     0.00     0.00  std::_Vector_base<int, std::allocator<int> >::_Vector_impl::_Vector_impl()
 0.00      1.01     0.00        1     0.00     0.00  std::_Vector_base<int, std::allocator<int> >::_Vector_impl::~_Vector_impl()
 0.00      1.01     0.00        1     0.00     0.00  std::_Vector_base<int, std::allocator<int> >::_M_deallocate(int*, unsigned long)
 0.00      1.01     0.00        1     0.00     0.00  std::_Vector_base<int, std::allocator<int> >::_M_get_Tp_allocator()
 0.00      1.01     0.00        1     0.00     0.00  std::_Vector_base<int, std::allocator<int> >::_Vector_base()
 0.00      1.01     0.00        1     0.00     0.00  std::_Vector_base<int, std::allocator<int> >::~_Vector_base()
 0.00      1.01     0.00        1     0.00     0.00  std::vector<int, std::allocator<int> >::vector()
 0.00      1.01     0.00        1     0.00     0.00  std::vector<int, std::allocator<int> >::~vector()
 0.00      1.01     0.00        1     0.00     0.00  void std::_Destroy<int*>(int*, int*)
 0.00      1.01     0.00        1     0.00     0.00  void std::_Destroy<int*, int>(int*, int*, std::allocator<int>&)

Call Graph

Media:callgraph.pdf

Full Profile Data File

Since the profile data is too large a download of it in pdf format will also be included.

Download: Media:profile-data.pdf

Analysis

http://pastebin.com/ZS7x0Uvq

As seen in the call graph of the profile data 98% of the time spent in the program is spent processing in the main file. Within this main multiple calls to the Color3d overloaded operator is made (located in the Color3d file). This makes the program ideal for parallelization as many of the steps used to process the pixel colours can be delegated to the Nvidia GPU. This can be seen in the above code snippets (found in the link above) when calls to Color3d operator are done multiple times in a triple for loop.


Sorted array processing by Abbas Zoeb

Sorted array processing finds the processing time it takes for an array before and after it has been sorted. This code basically allocates an array with random numbers (array size is 32768). Then the array is sorted using the standard library inbuilt sorting algorithm (which most likely is using Quick Sort). Then another loop is run through the entire array to find all the elements that are above the number '127'. This loop is inside another loop that rotates for another 100,000 times. The loop would go by faster as the array is already sorted and would be much easier now to find elements greater than 127. Profiling was performed on this c++ code on matrix. The project can be found here: [1]

Running the Program

The file can be simply compiled using g++ or clang++ on matrix.

Steps to compile the code and do the profiling on it:

0) Copy the code from GitHub and paste it into a cpp file (I named it sorted.cpp)

1) g++ -Wall -pg sorted.cpp -o sorted

2) ./sorted

3) gprof sorted gmon.out > prof_output

4) cat prof_output

Now you can see the flat and graph profiling for the code.

The Code

 
#include <algorithm>  
#include <ctime>  
#include <iostream>  

int main()
{
    // Generate data
    const unsigned arraySize = 32768;
    int data[arraySize];

    for (unsigned c = 0; c < arraySize; ++c)
        data[c] = std::rand() % 256;

    // The next loop runs faster because we are using a sorted array
    std::sort(data, data + arraySize);

    // Test
    clock_t start = clock();
    long long sum = 0;

    for (unsigned i = 0; i < 100000; ++i)
    {
        // Primary loop
        for (unsigned c = 0; c < arraySize; ++c)
        {
            if (data[c] >= 128)
                sum += data[c];
        }
    }

    double elapsedTime = static_cast<double>(clock() - start) / CLOCKS_PER_SEC;

    std::cout << elapsedTime << std::endl;
    std::cout << "sum = " << sum << std::endl;
}

Analysis

Here we can see the profiling as below:

Flat profile:

Each sample counts as 0.01 seconds.

 %   cumulative   self              self     total           
time   seconds   seconds    calls  ms/call  ms/call  name    
99.93     14.89    14.89                             main
 0.07     14.90     0.01   120348     0.00     0.00  void std::__iter_swap<true>::iter_swap<int*, int*>(int*, int*)
 0.00     14.90     0.00   120348     0.00     0.00  void std::swap<int>(int&, int&)
 0.00     14.90     0.00   120348     0.00     0.00  void std::iter_swap<int*, int*>(int*, int*)
 0.00     14.90     0.00    32767     0.00     0.00  void std::__unguarded_linear_insert<int*>(int*)
 0.00     14.90     0.00     2963     0.00     0.00  void std::__move_median_first<int*>(int*, int*, int*)
 0.00     14.90     0.00     2963     0.00     0.00  int* std::__unguarded_partition<int*, int>(int*, int*, int const&)
 0.00     14.90     0.00     2963     0.00     0.00  int* std::__unguarded_partition_pivot<int*>(int*, int*)
 0.00     14.90     0.00        1     0.00     0.00  _GLOBAL__sub_I_main
 0.00     14.90     0.00        1     0.00     0.00  __static_initialization_and_destruction_0(int, int)
 0.00     14.90     0.00        1     0.00     0.00  void std::__insertion_sort<int*>(int*, int*)
 0.00     14.90     0.00        1     0.00    10.00  void std::__introsort_loop<int*, int>(int*, int*, int)
 0.00     14.90     0.00        1     0.00     0.00  void std::__final_insertion_sort<int*>(int*, int*)
 0.00     14.90     0.00        1     0.00     0.00  void std::__unguarded_insertion_sort<int*>(int*, int*)
 0.00     14.90     0.00        1     0.00     0.00  std::__lg(int)
 0.00     14.90     0.00        1     0.00    10.00  void std::sort<int*>(int*, int*)

Full profiling can be found here - http://pastebin.com/vByZLWtd


Index by function name

 [14] _GLOBAL__sub_I_main     [7] int* std::__unguarded_partition<int*, int>(int*, int*, int const&) [6] void std::sort<int*>(int*, int*)
 [15] __static_initialization_and_destruction_0(int, int) [17] void std::__final_insertion_sort<int*>(int*, int*) [12] void std::swap<int>(int&, int&)
  [2] void std::__iter_swap<true>::iter_swap<int*, int*>(int*, int*) [13] void std::__unguarded_linear_insert<int*>(int*) [3] void std::iter_swap<int*, int*>(int*, int*)
 [16] void std::__insertion_sort<int*>(int*, int*) [18] void std::__unguarded_insertion_sort<int*>(int*, int*) [1] main
  [5] void std::__introsort_loop<int*, int>(int*, int*, int) [4] int* std::__unguarded_partition_pivot<int*>(int*, int*)
  [8] void std::__move_median_first<int*>(int*, int*, int*) [19] std::__lg(int)


      • The compilation of the code takes roughly 14 seconds and when the code is profiled we can see that the main uses most of the % processing time. i.e., 99.93%.


Final Statement

We plan to parallelize the "ColorTransfer" program as it yields better theoretical speedup of the execution of the whole task in comparison to Sudoku and Sorted Array Processing.

Assignment 2

Source : A2

Description

Removing CPU Bottleneck

The CPU bottleneck in the ColorTransfer/main.cpp:

// Multiplication of matrix and vector
Color3d operator *(const cv::Mat& M, Color3d& v) {
	Color3d u = Color3d();
	for(int i=0; i<3; i++) {
		u(i) = 0.0;
		for(int j=0; j<3; j++) {
			u(i) += M.at<double>(i, j) * v(j);
		}
	}
	return u;
}

Added functions and changes

To fix this issue We wrote a device function which handles a matrix by vector multiplication. We also wrote a kernel which has equal logic to the host version so we could off load all the needed data to the kernel without needing to do multiple back and forth(s). We also wrote a helper function which will allocate all transfer all the opencv matrices to a suitable form for the kernel to deal with.

Matrix by vector

__device__ void matvec(float* d_A, float* d_B, float* d_C)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    float sum = 0;
    if (tid < 3)
    {
        for (int i = 0; i < 3; ++i)
        {
            sum += d_A[i] * d_B[(i * 3) + tid];
        }

        d_C[tid] = sum;
    }
}

Kernel

__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C,
    const int n, int targetrows, int targetcols, float* d_Tar)
{
    const double eps = 1.0e-4;
    for (int y = 0; y < targetrows; ++y) {
        for (int x = 0; x < targetcols; ++x) {
            memcpy(&d_A, &d_Tar[y * 3 + x], 3 * sizeof(float));

            matvec(d_A, d_RGB2, d_C);
            memcpy(&d_A, d_C, 3 * sizeof(float));

            for (int c = 0; c < 3; c++)
                d_A[c] = d_A[c] > -5.0 ? pow((double)10.0, (double)d_A[c]) : eps;

            matvec(d_A, d_LMS2, d_C);
            memcpy(&d_Tar[y * 3 + x], d_C, 3 * sizeof(float));
        }
    }
}

Helper

inline void vecTransfer(float* h, Color3d* v)
{
    for (int j = 0; j < 3; ++j)
        h[j] = v->v[j];
}

//KERNEL Helper function does setup and launch
void matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar)
{
    float *h_A, *h_RGB2, *h_LMS2, *d_Tar;
    float *d_A, *d_RGB2, *d_LMS2, *d_C;

    int N = 3;

    h_A = (float*)malloc(sizeof(float) * N);
    h_RGB2 = new float[mRGB2LMS->total()];
    h_LMS2 = new float[mLMS2lab->total()];
    //h_C = (float*)malloc(sizeof(float) * N);

    cudaMalloc((void**)&d_A, sizeof(float) * N);
    cudaMalloc((void**)&d_RGB2, sizeof(float) * N * N);
    cudaMalloc((void**)&d_LMS2, sizeof(float) * N * N);
    cudaMalloc((void**)&d_C, sizeof(h_C));
    cudaMalloc((void**)&d_Tar, sizeof(h_Tar));

    Color3d vec;

    //copy vec and matrix to host pointers
    vecTransfer(h_A, &vec);
    memcpy(h_RGB2, mRGB2LMS->data, mRGB2LMS->total());
    memcpy(h_LMS2, mLMS2lab->data, mLMS2lab->total());

    cudaMemcpy(d_A, h_A, sizeof(float) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_RGB2, h_RGB2, sizeof(float) * N * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_LMS2, h_LMS2, sizeof(float) * N * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_Tar, h_Tar, sizeof(h_Tar), cudaMemcpyHostToDevice);

    matvec_kernel << <N / BLOCK_SIZE + 1, BLOCK_SIZE >> >(d_A, d_RGB2, d_LMS2, d_C, N, tarrow, tarcol, d_Tar);
    //printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));

    cudaMemcpy(h_C, d_C, sizeof(h_C), cudaMemcpyDeviceToHost);

    free(h_A);
    free(h_RGB2);
    free(h_LMS2);
    //free(h_C);

    cudaFree(d_A);
    cudaFree(d_RGB2);
    cudaFree(d_LMS2);
    cudaFree(d_C);
    cudaFree(d_Tar);
}


Changes to main loop

Old

for (int y = 0; y < target.rows; y++) {
    for (int x = 0; x < target.cols; x++) {
      v = target.at<Color3d>(y, x);

      matvec_L(&v, &mlab2LMS, h_C);
      memcpy(&v, h_C, N * sizeof(float));

      for (int c = 0; c < 3; c++)
      v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;

      matvec_L(&v, &mLMS2RGB, h_C);
      memcpy(&target.at<Color3d>(y, x), h_C, N * sizeof(float));
   }
}

New

    float* h_TARGET = (float *)malloc(sizeof(target.data));
    memcpy(h_TARGET, target.data, sizeof(target.data));
    matvec_L(&mlab2LMS, &mLMS2RGB, h_C, rows, cols, h_TARGET);

Results

For our profiling we had three test cases.

(Size is in pixels)
Small : 400x400 by 400x400
Medium : 1400x989 by 1215x717
Large : 7362x4858 by 6000x4032

A2timings.PNG

Assignment 3

Link to Original Unchanged Project and project used for a3

Download

The file contains the visual studio solutions used for Assignment 3 as well as visual studio solution version of the original code.

To run the location of the OPENCV directory must be applied to the following project properties:

1. Under c/c++ -> general -> additional include directories -> the path to opencv\..\..\include 2. Under linker -> general -> additional library directories -> the path to opencv\..\..\lib

After doing the above, build the solution and run the appropriate Release or Debug exe with the target and reference image as arguments.

A link the the corresponding xls file of the run time can be found here

What was done

At first the power function used was switched out with __pow in the kernel as the traditional pow function is more heavy of a function. But the results were very small causing a different from around 10-30 milliseconds. Afterwards the kernel was upgraded to implement grid and strides. In doing so instead of doing all the calculations on one thread, many of them were able to be calculated on a separate row. This made it so that for every pixel with a row one thread would be responsible for the colour shift.

Other implementations were made to transfer all the data necessary for calculations all at once from the beginning and then perform all calculations done by tatsy on the device side, but due to time constraints and the complication of the project we were unable to fully implement these changes. The code is however left (commented) in the included project download.

Optimized Kernel

__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C,
	const int n, int targetrows, int targetcols, float* d_Tar)
{
	const double eps = 1.0e-4;
	//grid-stride loop
	for (int tid = threadIdx.x + blockIdx.x * blockDim.x;
		tid < targetrows;
		tid += blockDim.x * gridDim.x)
	{
		for (int x = 0; x < targetcols; ++x) {
			memcpy(&d_A, &d_Tar[tid * 3 + x], 3 * sizeof(float));

			matvec(d_A, d_RGB2, d_C);
			memcpy(&d_A, d_C, 3 * sizeof(float));

			for (int c = 0; c < 3; c++)
				d_A[c] = d_A[c] > -5.0 ? __powf(10.0f, d_A[c]) : eps;

			matvec(d_A, d_LMS2, d_C);
			memcpy(&d_Tar[tid * 3 + x], d_C, 3 * sizeof(float));
		}
	}
}

Results

A3timings.PNG


When running in release and comparing the results to the original unchanged project straight from tatsy we noticed that for very small images the cuda version is slightly slower. This is probably due to the actual conversion of colour on a smaller image to be much shorter, whereas the transferring of the data over to device memory itself may add additional time. Overall when comparing the optimized and unoptimized versions of the kernel there was a visible increase.