Difference between revisions of "TriForce"
(→Assignment 3) |
(→Assignment 3) |
||
Line 1,828: | Line 1,828: | ||
Used faster memory | Used faster memory | ||
-each thread now remembers which values it has seen in a boolean array | -each thread now remembers which values it has seen in a boolean array | ||
− | Reduced Thread Divergence | + | Reduced Thread Divergence/CGMA |
- values are only assigned to the grid after the kernel 'solves' the sudoku removing wait times for assigning to global memory | - values are only assigned to the grid after the kernel 'solves' the sudoku removing wait times for assigning to global memory | ||
- at value in kernel and rowHas, colHas, boxHas, updated in a single place, reducing wait time for updating | - at value in kernel and rowHas, colHas, boxHas, updated in a single place, reducing wait time for updating |
Revision as of 10:07, 22 March 2019
GPU610/DPS915 | Student List | Group and Project Index | Student Resources | Glossary
Contents
TriForce
Team Members
- David Ferri, Sudoku Solver
- Vincent Terpstra, Julia Sets
- Raymond Kiguru, EasyBMP
Progress
Assignment 1: Sudoku Solver
Sudoku Solver Profiling
Source code from: https://www.geeksforgeeks.org/sudoku-backtracking-7/
Original Code:
// A backtracking program in C++ to solve a Sudoku problem
#include <stdio.h>

// UNASSIGNED marks an empty cell in the sudoku grid
#define UNASSIGNED 0

// N is the side length of the sudoku grid; the grid is NxN and holds 1..N
#define N 9

// BOX is the side length of one sub-box; it must be the square root of N
#define BOX 3

// Catch a mismatched N/BOX pair at compile time instead of solving a
// different puzzle silently.
static_assert(BOX * BOX == N, "BOX must be the square root of N");

// Finds an entry in grid that is still unassigned
bool FindUnassignedLocation(int grid[N][N], int &row, int &col);

// Checks whether it is legal to assign num to the given row, col
bool isSafe(int grid[N][N], int row, int col, int num);

/* Takes a partially filled-in grid and attempts to assign values to
   all unassigned locations so that no value is duplicated within a
   row, a column, or a box.
   Returns true when the grid has been completed in place, false when
   no assignment works (which makes the caller backtrack). */
bool SolveSudoku(int grid[N][N])
{
    int row, col;

    // No unassigned location left: the grid is solved
    if (!FindUnassignedLocation(grid, row, col))
        return true;

    // Try every candidate value 1..N in the first empty cell
    for (int num = 1; num <= N; num++)
    {
        if (isSafe(grid, row, col, num))
        {
            // Tentative assignment
            grid[row][col] = num;

            if (SolveSudoku(grid))
                return true;

            // Dead end: undo and try the next value
            grid[row][col] = UNASSIGNED;
        }
    }
    return false; // this triggers backtracking
}

/* Scans the grid in row-major order for an unassigned entry.
   On success row and col hold its location and true is returned.
   Returns false when no unassigned entry remains (row and col are
   then both N). */
bool FindUnassignedLocation(int grid[N][N], int &row, int &col)
{
    for (row = 0; row < N; row++)
        for (col = 0; col < N; col++)
            if (grid[row][col] == UNASSIGNED)
                return true;
    return false;
}

/* Returns true when some entry in the given row equals num. */
bool UsedInRow(int grid[N][N], int row, int num)
{
    for (int col = 0; col < N; col++)
        if (grid[row][col] == num)
            return true;
    return false;
}

/* Returns true when some entry in the given column equals num. */
bool UsedInCol(int grid[N][N], int col, int num)
{
    for (int row = 0; row < N; row++)
        if (grid[row][col] == num)
            return true;
    return false;
}

/* Returns true when some entry in the BOX x BOX box whose top-left
   corner is (boxStartRow, boxStartCol) equals num. */
bool UsedInBox(int grid[N][N], int boxStartRow, int boxStartCol, int num)
{
    for (int row = 0; row < BOX; row++)
        for (int col = 0; col < BOX; col++)
            if (grid[row + boxStartRow][col + boxStartCol] == num)
                return true;
    return false;
}

/* Returns true when num may legally be placed at (row, col): it does
   not already occur in that row, column, or enclosing box, and the
   cell itself is still unassigned. */
bool isSafe(int grid[N][N], int row, int col, int num)
{
    // row - row % BOX rounds down to the top-left corner of the box
    return !UsedInRow(grid, row, num) &&
           !UsedInCol(grid, col, num) &&
           !UsedInBox(grid, row - row % BOX, col - col % BOX, num) &&
           grid[row][col] == UNASSIGNED;
}

/* Prints the grid, one row per line, each cell with width 2. */
void printGrid(int grid[N][N])
{
    for (int row = 0; row < N; row++)
    {
        for (int col = 0; col < N; col++)
            printf("%2d", grid[row][col]);
        printf("\n");
    }
}

/* Driver program: solves the built-in puzzle and prints the result. */
int main()
{
    // 0 means unassigned cells
    int grid[N][N] = {{3, 0, 6, 5, 0, 8, 4, 0, 0},
                      {5, 2, 0, 0, 0, 0, 0, 0, 0},
                      {0, 8, 7, 0, 0, 0, 0, 3, 1},
                      {0, 0, 3, 0, 1, 0, 0, 8, 0},
                      {9, 0, 0, 8, 6, 3, 0, 0, 5},
                      {0, 5, 0, 0, 9, 0, 6, 0, 0},
                      {1, 3, 0, 0, 0, 0, 2, 5, 0},
                      {0, 0, 0, 0, 0, 0, 0, 7, 4},
                      {0, 0, 5, 2, 0, 6, 3, 0, 0}};

    if (SolveSudoku(grid))
        printGrid(grid);
    else
        printf("No solution exists");

    return 0;
}
Modified Version for 16x16 Puzzle:
// A backtracking program in C++ to solve a Sudoku problem
#include <stdio.h>

// UNASSIGNED marks an empty cell in the sudoku grid
#define UNASSIGNED 0

// N is the side length of the sudoku grid; the grid is NxN and holds 1..N
#define N 16

// BOX is the side length of one sub-box; it must be the square root of N
#define BOX 4

// Catch a mismatched N/BOX pair at compile time instead of solving a
// different puzzle silently.
static_assert(BOX * BOX == N, "BOX must be the square root of N");

// Finds an entry in grid that is still unassigned
bool FindUnassignedLocation(int grid[N][N], int &row, int &col);

// Checks whether it is legal to assign num to the given row, col
bool isSafe(int grid[N][N], int row, int col, int num);

/* Takes a partially filled-in grid and attempts to assign values to
   all unassigned locations so that no value is duplicated within a
   row, a column, or a box.
   Returns true when the grid has been completed in place, false when
   no assignment works (which makes the caller backtrack). */
bool SolveSudoku(int grid[N][N])
{
    int row, col;

    // No unassigned location left: the grid is solved
    if (!FindUnassignedLocation(grid, row, col))
        return true;

    // Try every candidate value 1..N in the first empty cell
    for (int num = 1; num <= N; num++)
    {
        if (isSafe(grid, row, col, num))
        {
            // Tentative assignment
            grid[row][col] = num;

            if (SolveSudoku(grid))
                return true;

            // Dead end: undo and try the next value
            grid[row][col] = UNASSIGNED;
        }
    }
    return false; // this triggers backtracking
}

/* Scans the grid in row-major order for an unassigned entry.
   On success row and col hold its location and true is returned.
   Returns false when no unassigned entry remains (row and col are
   then both N). */
bool FindUnassignedLocation(int grid[N][N], int &row, int &col)
{
    for (row = 0; row < N; row++)
        for (col = 0; col < N; col++)
            if (grid[row][col] == UNASSIGNED)
                return true;
    return false;
}

/* Returns true when some entry in the given row equals num. */
bool UsedInRow(int grid[N][N], int row, int num)
{
    for (int col = 0; col < N; col++)
        if (grid[row][col] == num)
            return true;
    return false;
}

/* Returns true when some entry in the given column equals num. */
bool UsedInCol(int grid[N][N], int col, int num)
{
    for (int row = 0; row < N; row++)
        if (grid[row][col] == num)
            return true;
    return false;
}

/* Returns true when some entry in the BOX x BOX box whose top-left
   corner is (boxStartRow, boxStartCol) equals num. */
bool UsedInBox(int grid[N][N], int boxStartRow, int boxStartCol, int num)
{
    for (int row = 0; row < BOX; row++)
        for (int col = 0; col < BOX; col++)
            if (grid[row + boxStartRow][col + boxStartCol] == num)
                return true;
    return false;
}

/* Returns true when num may legally be placed at (row, col): it does
   not already occur in that row, column, or enclosing box, and the
   cell itself is still unassigned. */
bool isSafe(int grid[N][N], int row, int col, int num)
{
    // row - row % BOX rounds down to the top-left corner of the box
    return !UsedInRow(grid, row, num) &&
           !UsedInCol(grid, col, num) &&
           !UsedInBox(grid, row - row % BOX, col - col % BOX, num) &&
           grid[row][col] == UNASSIGNED;
}

/* Prints the grid, one row per line, each cell with width 2.
   NOTE(review): values of 10 and above fill the whole field, so
   neighbouring two-digit cells print with no separator. The format is
   left unchanged here so the program's output stays identical. */
void printGrid(int grid[N][N])
{
    for (int row = 0; row < N; row++)
    {
        for (int col = 0; col < N; col++)
            printf("%2d", grid[row][col]);
        printf("\n");
    }
}

/* Driver program: solves the built-in puzzle and prints the result. */
int main()
{
    // https://puzzlemadness.co.uk/16by16giantsudoku/
    // 0 means unassigned cells
    int grid[N][N] = {{0, 8, 0, 0, 0, 0, 0, 3, 0, 0, 0, 10, 9, 7, 11, 0},
                      {0, 9, 15, 13, 0, 10, 0, 0, 2, 6, 8, 16, 0, 0, 0, 0},
                      {0, 0, 16, 0, 15, 0, 8, 0, 9, 0, 0, 0, 6, 0, 2, 0},
                      {1, 0, 2, 0, 9, 11, 4, 6, 15, 3, 5, 7, 0, 0, 12, 0},
                      {16, 6, 4, 0, 5, 2, 0, 0, 1, 0, 0, 0, 11, 0, 0, 12},
                      {5, 11, 0, 0, 0, 3, 0, 15, 0, 16, 0, 13, 0, 1, 0, 8},
                      {0, 0, 3, 0, 0, 6, 11, 14, 0, 5, 7, 0, 0, 9, 0, 0},
                      {0, 0, 0, 14, 8, 0, 10, 0, 0, 11, 12, 0, 0, 0, 0, 0},
                      {0, 7, 13, 0, 0, 0, 0, 12, 0, 8, 9, 0, 0, 0, 3, 0},
                      {0, 0, 11, 9, 0, 7, 0, 0, 0, 0, 0, 12, 0, 8, 16, 5},
                      {0, 0, 10, 0, 11, 13, 0, 0, 0, 0, 0, 3, 12, 0, 6, 0},
                      {0, 5, 0, 0, 10, 15, 0, 1, 7, 2, 0, 0, 14, 11, 0, 0},
                      {0, 0, 5, 0, 0, 12, 14, 0, 0, 10, 0, 0, 15, 0, 0, 4},
                      {9, 0, 14, 6, 0, 0, 1, 0, 16, 0, 2, 0, 3, 0, 13, 0},
                      {8, 13, 0, 4, 0, 0, 0, 0, 12, 7, 3, 0, 0, 6, 0, 0},
                      {0, 16, 12, 0, 0, 5, 0, 9, 0, 13, 14, 4, 1, 0, 0, 0}};

    if (SolveSudoku(grid))
        printGrid(grid);
    else
        printf("No solution exists");

    return 0;
}
Modified Version for 25x25 puzzle:
// A backtracking program in C++ to solve a Sudoku problem
#include <stdio.h>

// UNASSIGNED marks an empty cell in the sudoku grid
#define UNASSIGNED 0

// N is the side length of the sudoku grid; the grid is NxN and holds 1..N
#define N 25

// BOX is the side length of one sub-box; it must be the square root of N
#define BOX 5

// Catch a mismatched N/BOX pair at compile time instead of solving a
// different puzzle silently.
static_assert(BOX * BOX == N, "BOX must be the square root of N");

// Finds an entry in grid that is still unassigned
bool FindUnassignedLocation(int grid[N][N], int &row, int &col);

// Checks whether it is legal to assign num to the given row, col
bool isSafe(int grid[N][N], int row, int col, int num);

/* Takes a partially filled-in grid and attempts to assign values to
   all unassigned locations so that no value is duplicated within a
   row, a column, or a box.
   Returns true when the grid has been completed in place, false when
   no assignment works (which makes the caller backtrack). */
bool SolveSudoku(int grid[N][N])
{
    int row, col;

    // No unassigned location left: the grid is solved
    if (!FindUnassignedLocation(grid, row, col))
        return true;

    // Try every candidate value 1..N in the first empty cell
    for (int num = 1; num <= N; num++)
    {
        if (isSafe(grid, row, col, num))
        {
            // Tentative assignment
            grid[row][col] = num;

            if (SolveSudoku(grid))
                return true;

            // Dead end: undo and try the next value
            grid[row][col] = UNASSIGNED;
        }
    }
    return false; // this triggers backtracking
}

/* Scans the grid in row-major order for an unassigned entry.
   On success row and col hold its location and true is returned.
   Returns false when no unassigned entry remains (row and col are
   then both N). */
bool FindUnassignedLocation(int grid[N][N], int &row, int &col)
{
    for (row = 0; row < N; row++)
        for (col = 0; col < N; col++)
            if (grid[row][col] == UNASSIGNED)
                return true;
    return false;
}

/* Returns true when some entry in the given row equals num. */
bool UsedInRow(int grid[N][N], int row, int num)
{
    for (int col = 0; col < N; col++)
        if (grid[row][col] == num)
            return true;
    return false;
}

/* Returns true when some entry in the given column equals num. */
bool UsedInCol(int grid[N][N], int col, int num)
{
    for (int row = 0; row < N; row++)
        if (grid[row][col] == num)
            return true;
    return false;
}

/* Returns true when some entry in the BOX x BOX box whose top-left
   corner is (boxStartRow, boxStartCol) equals num. */
bool UsedInBox(int grid[N][N], int boxStartRow, int boxStartCol, int num)
{
    for (int row = 0; row < BOX; row++)
        for (int col = 0; col < BOX; col++)
            if (grid[row + boxStartRow][col + boxStartCol] == num)
                return true;
    return false;
}

/* Returns true when num may legally be placed at (row, col): it does
   not already occur in that row, column, or enclosing box, and the
   cell itself is still unassigned. */
bool isSafe(int grid[N][N], int row, int col, int num)
{
    // row - row % BOX rounds down to the top-left corner of the box
    return !UsedInRow(grid, row, num) &&
           !UsedInCol(grid, col, num) &&
           !UsedInBox(grid, row - row % BOX, col - col % BOX, num) &&
           grid[row][col] == UNASSIGNED;
}

/* Prints the grid, one row per line, each cell with width 2.
   NOTE(review): values of 10 and above fill the whole field, so
   neighbouring two-digit cells print with no separator. The format is
   left unchanged here so the program's output stays identical. */
void printGrid(int grid[N][N])
{
    for (int row = 0; row < N; row++)
    {
        for (int col = 0; col < N; col++)
            printf("%2d", grid[row][col]);
        printf("\n");
    }
}

/* Driver program: solves the built-in puzzle and prints the result. */
int main()
{
    // http://www.sudoku-download.net/sudoku_25x25.php
    // 0 means unassigned cells
    int grid[N][N] = {{1, 0, 4, 0, 25, 0, 19, 0, 0, 10, 21, 8, 0, 14, 0, 6, 12, 9, 0, 0, 0, 0, 0, 0, 5},
                      {5, 0, 19, 23, 24, 0, 22, 12, 0, 0, 16, 6, 0, 20, 0, 18, 0, 25, 14, 13, 10, 11, 0, 1, 15},
                      {0, 0, 0, 0, 0, 0, 21, 5, 0, 20, 11, 10, 0, 1, 0, 4, 8, 24, 23, 15, 18, 0, 16, 22, 19},
                      {0, 7, 21, 8, 18, 0, 0, 0, 11, 0, 5, 0, 0, 24, 0, 0, 0, 17, 22, 1, 9, 6, 25, 0, 0},
                      {0, 13, 15, 0, 22, 14, 0, 18, 0, 16, 0, 0, 0, 4, 0, 0, 0, 19, 0, 0, 0, 24, 20, 21, 17},
                      {12, 0, 11, 0, 6, 0, 0, 0, 0, 15, 0, 0, 0, 0, 21, 25, 19, 0, 4, 0, 22, 14, 0, 20, 0},
                      {8, 0, 0, 21, 0, 16, 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 17, 23, 18, 22, 0, 0, 0, 24, 6},
                      {4, 0, 14, 18, 7, 9, 0, 22, 21, 19, 0, 0, 0, 2, 0, 5, 0, 0, 0, 6, 16, 15, 0, 11, 12},
                      {22, 0, 24, 0, 23, 0, 0, 11, 0, 7, 0, 0, 4, 0, 14, 0, 2, 12, 0, 8, 5, 19, 0, 25, 9},
                      {20, 0, 0, 0, 5, 0, 0, 0, 0, 17, 9, 0, 12, 18, 0, 1, 0, 0, 7, 24, 0, 0, 0, 13, 4},
                      {13, 0, 0, 5, 0, 2, 23, 14, 4, 18, 22, 0, 17, 0, 0, 20, 0, 1, 9, 21, 12, 0, 0, 8, 11},
                      {14, 23, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 20, 25, 0, 3, 4, 13, 0, 11, 21, 9, 5, 18, 22},
                      {7, 0, 0, 11, 17, 20, 24, 0, 0, 0, 3, 4, 1, 12, 0, 0, 6, 14, 0, 5, 25, 13, 0, 0, 0},
                      {0, 0, 16, 9, 0, 17, 11, 7, 10, 25, 0, 0, 0, 13, 6, 0, 0, 18, 0, 0, 19, 4, 0, 0, 20},
                      {6, 15, 0, 19, 4, 13, 0, 0, 5, 0, 18, 11, 0, 0, 9, 8, 22, 16, 25, 10, 7, 0, 0, 0, 0},
                      {0, 0, 0, 2, 0, 0, 10, 19, 3, 0, 1, 0, 22, 9, 4, 11, 15, 0, 20, 0, 0, 8, 23, 0, 25},
                      {0, 24, 8, 13, 1, 0, 0, 4, 20, 0, 17, 14, 0, 0, 18, 0, 16, 22, 5, 0, 11, 0, 10, 0, 0},
                      {23, 10, 0, 0, 0, 0, 0, 0, 18, 0, 6, 0, 16, 0, 0, 17, 1, 0, 13, 0, 0, 3, 19, 12, 0},
                      {25, 5, 0, 14, 11, 0, 17, 0, 8, 24, 13, 0, 19, 23, 15, 9, 0, 0, 12, 0, 20, 0, 22, 0, 7},
                      {0, 0, 17, 4, 0, 22, 15, 0, 23, 11, 12, 25, 0, 0, 0, 0, 18, 8, 0, 7, 0, 0, 14, 0, 13},
                      {19, 6, 23, 22, 8, 0, 0, 1, 25, 4, 14, 2, 0, 3, 7, 13, 10, 11, 16, 0, 0, 0, 0, 0, 0},
                      {0, 4, 0, 17, 0, 3, 0, 24, 0, 8, 20, 23, 11, 10, 25, 22, 0, 0, 0, 12, 13, 2, 18, 6, 0},
                      {0, 0, 7, 16, 0, 0, 6, 17, 2, 21, 0, 18, 0, 0, 0, 19, 0, 0, 8, 0, 0, 0, 0, 4, 0},
                      {18, 9, 25, 1, 2, 11, 0, 0, 13, 22, 4, 0, 21, 0, 5, 0, 23, 7, 0, 0, 15, 0, 3, 0, 8},
                      {0, 21, 10, 0, 0, 12, 0, 20, 16, 0, 19, 0, 0, 0, 0, 15, 14, 4, 2, 18, 23, 25, 11, 7, 0}};

    if (SolveSudoku(grid))
        printGrid(grid);
    else
        printf("No solution exists");

    return 0;
}
Obtaining flat profiles and call graphs on matrix environment:
$ g++ sudokuC.cpp -std=c++0x -pg -o Sudoku $ ./Sudoku 3 1 6 5 7 8 4 9 2 5 2 9 1 3 4 7 6 8 4 8 7 6 2 9 5 3 1 2 6 3 4 1 5 9 8 7 9 7 4 8 6 3 1 2 5 8 5 1 7 9 2 6 4 3 1 3 8 9 4 7 2 5 6 6 9 2 3 5 1 8 7 4 7 4 5 2 8 6 3 1 9 $ gprof -p -b ./Sudoku gmon.out > 9x9.flt $ gprof -q -b ./Sudoku gmon.out > 9x9.clg
$ g++ sudokuC16.cpp -std=c++0x -pg -o Sudoku16 $ ./Sudoku16 12 8 6 516 1 2 31314 410 9 71115 11 915131210 7 5 2 6 816 414 1 3 4 316 71514 813 91211 1 6 5 210 114 210 911 4 615 3 5 7 8131216 16 6 415 5 213 7 1 910 811 31412 511 9 2 4 312151416 613 7 110 8 1012 3 8 1 61114 4 5 7 216 91513 13 1 714 8 91016 3111215 2 4 5 6 2 71316 6 4 51211 8 9141015 3 1 6 411 914 7 3 210 1151213 816 5 141510 11113 9 8 5 416 312 2 6 7 3 5 812101516 1 7 213 61411 4 9 7 2 5 313121411 610 1 91516 8 4 91014 6 7 8 1 41615 2 5 3121311 813 1 4 216151012 7 311 5 6 914 15161211 3 5 6 9 81314 4 110 7 2 $ gprof -p -b ./Sudoku16 gmon.out > 16x16.flt $ gprof -q -b ./Sudoku16 gmon.out > 16x16.clg
$ g++ sudokuC25.cpp -std=c++0x -pg -o Sudoku25 $ ./Sudoku25 111 42025241915171021 8181422 612 9 316 2 71323 5 5 2192324 82212 9 316 6 7201718212514131011 4 115 1714 9 6 32521 5 7201110 2 113 4 82423151812162219 16 721 818 4 2131123 51915241210201722 1 9 62514 3 101315122214 118 61623 925 4 3 7 51911 2 824202117 12 11110 6 513232415 716 817212519 3 4 92214 22018 8191321 916 42512 215 3 511201417231822 110 724 6 4171418 7 9 322211925 124 223 5132010 61615 81112 22 3241523182011 1 71013 4 61416 21221 8 5191725 9 2016 225 510 8 61417 922121819 11115 724 3232113 4 1325 3 510 22314 418221517192420 7 1 9211216 6 811 1423 124121916 815 6 2 7202510 3 413171121 9 51822 7 818111720242122 9 3 4 11216 2 61419 52513151023 22216 9211711 71025 8 51413 6122418152319 4 1 320 6152019 41312 3 5 118112321 9 822162510 71724 214 211812 216 71019 313 12422 9 41115 6201417 823 525 924 813 1 625 420121714 3 718231622 51911211015 2 231022 71521 5 91814 62016 81117 1 21325 4 3191224 25 5 61411 117 2 8241321192315 9 31012 420182216 7 32017 4192215162311122510 5 22118 824 7 6 114 913 19 62322 81518 125 414 2 9 3 7131011162024 5121721 15 4 51714 3 72419 8202311102522 921 11213 218 616 1112 7162023 617 22124181315 11925 5 8 31422 9 410 18 925 1 21114101322 4122116 52423 7 6171520 319 8 242110 31312 92016 51917 622 81514 4 218232511 7 1 $ gprof -p -b ./Sudoku25 gmon.out > 25x25.flt $ gprof -q -b ./Sudoku25 gmon.out > 25x25.clg
For 9x9 Sudoku Puzzle (3x3 squares)
Flat profile: Each sample counts as 0.01 seconds. no time accumulated % cumulative self self total time seconds seconds calls Ts/call Ts/call name 0.00 0.00 0.00 6732 0.00 0.00 isSafe(int (*) [9], int, int, int) 0.00 0.00 0.00 6732 0.00 0.00 UsedInRow(int (*) [9], int, int) 0.00 0.00 0.00 2185 0.00 0.00 UsedInCol(int (*) [9], int, int) 0.00 0.00 0.00 1078 0.00 0.00 UsedInBox(int (*) [9], int, int, int) 0.00 0.00 0.00 770 0.00 0.00 FindUnassignedLocation(int (*) [9], int&, int&) 0.00 0.00 0.00 1 0.00 0.00 SolveSudoku(int (*) [9]) 0.00 0.00 0.00 1 0.00 0.00 printGrid(int (*) [9])
Call graph granularity: each sample hit covers 2 byte(s) no time propagated index % time self children called name 0.00 0.00 6732/6732 SolveSudoku(int (*) [9]) [13] [8] 0.0 0.00 0.00 6732 isSafe(int (*) [9], int, int, int) [8] 0.00 0.00 6732/6732 UsedInRow(int (*) [9], int, int) [9] 0.00 0.00 2185/2185 UsedInCol(int (*) [9], int, int) [10] 0.00 0.00 1078/1078 UsedInBox(int (*) [9], int, int, int) [11] ----------------------------------------------- 0.00 0.00 6732/6732 isSafe(int (*) [9], int, int, int) [8] [9] 0.0 0.00 0.00 6732 UsedInRow(int (*) [9], int, int) [9] ----------------------------------------------- 0.00 0.00 2185/2185 isSafe(int (*) [9], int, int, int) [8] [10] 0.0 0.00 0.00 2185 UsedInCol(int (*) [9], int, int) [10] ----------------------------------------------- 0.00 0.00 1078/1078 isSafe(int (*) [9], int, int, int) [8] [11] 0.0 0.00 0.00 1078 UsedInBox(int (*) [9], int, int, int) [11] ----------------------------------------------- 0.00 0.00 770/770 SolveSudoku(int (*) [9]) [13] [12] 0.0 0.00 0.00 770 FindUnassignedLocation(int (*) [9], int&, int&) [12] ----------------------------------------------- 769 SolveSudoku(int (*) [9]) [13] 0.00 0.00 1/1 main [6] [13] 0.0 0.00 0.00 1+769 SolveSudoku(int (*) [9]) [13] 0.00 0.00 6732/6732 isSafe(int (*) [9], int, int, int) [8] 0.00 0.00 770/770 FindUnassignedLocation(int (*) [9], int&, int&) [12] 769 SolveSudoku(int (*) [9]) [13] ----------------------------------------------- 0.00 0.00 1/1 main [6] [14] 0.0 0.00 0.00 1 printGrid(int (*) [9]) [14] ----------------------------------------------- Index by function name [13] SolveSudoku(int (*) [9]) [11] UsedInBox(int (*) [9], int, int, int) [14] printGrid(int (*) [9]) [12] FindUnassignedLocation(int (*) [9], int&, int&) [10] UsedInCol(int (*) [9], int, int) [8] isSafe(int (*) [9], int, int, int) [9] UsedInRow(int (*) [9], int, int)
For 16x16 Sudoku Puzzle (4x4 squares)
Puzzle from: https://puzzlemadness.co.uk/16by16giantsudoku/
Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 39.04 15.00 15.00 28071636 0.00 0.00 FindUnassignedLocation(int (*) [16], int&, int&) 36.19 28.90 13.90 449145092 0.00 0.00 UsedInRow(int (*) [16], int, int) 10.60 32.97 4.07 120354547 0.00 0.00 UsedInCol(int (*) [16], int, int) 4.97 34.88 1.91 41212484 0.00 0.00 UsedInBox(int (*) [16], int, int, int) 4.59 36.65 1.76 1 1.76 38.39 SolveSudoku(int (*) [16]) 4.55 38.39 1.75 449145092 0.00 0.00 isSafe(int (*) [16], int, int, int) 0.01 38.40 0.01 frame_dummy 0.00 38.40 0.00 1 0.00 0.00 printGrid(int (*) [16])
Call graph granularity: each sample hit covers 2 byte(s) for 0.03% of 36.85 seconds index % time self children called name <spontaneous> [1] 100.0 0.00 36.85 main [1] 1.93 34.93 1/1 SolveSudoku(int (*) [16]) [2] 0.00 0.00 1/1 printGrid(int (*) [16]) [14] ----------------------------------------------- 28071635 SolveSudoku(int (*) [16]) [2] 1.93 34.93 1/1 main [1] [2] 100.0 1.93 34.93 1+28071635 SolveSudoku(int (*) [16]) [2] 1.69 19.09 449145092/449145092 isSafe(int (*) [16], int, int, int) [3] 14.14 0.00 28071636/28071636 FindUnassignedLocation(int (*) [16], int&, int&) [4] 28071635 SolveSudoku(int (*) [16]) [2] ----------------------------------------------- 1.69 19.09 449145092/449145092 SolveSudoku(int (*) [16]) [2] [3] 56.4 1.69 19.09 449145092 isSafe(int (*) [16], int, int, int) [3] 13.58 0.00 449145092/449145092 UsedInRow(int (*) [16], int, int) [5] 3.54 0.00 120354547/120354547 UsedInCol(int (*) [16], int, int) [6] 1.98 0.00 41212484/41212484 UsedInBox(int (*) [16], int, int, int) [7] ----------------------------------------------- 14.14 0.00 28071636/28071636 SolveSudoku(int (*) [16]) [2] [4] 38.4 14.14 0.00 28071636 FindUnassignedLocation(int (*) [16], int&, int&) [4] ----------------------------------------------- 13.58 0.00 449145092/449145092 isSafe(int (*) [16], int, int, int) [3] [5] 36.8 13.58 0.00 449145092 UsedInRow(int (*) [16], int, int) [5] ----------------------------------------------- 3.54 0.00 120354547/120354547 isSafe(int (*) [16], int, int, int) [3] [6] 9.6 3.54 0.00 120354547 UsedInCol(int (*) [16], int, int) [6] ----------------------------------------------- 1.98 0.00 41212484/41212484 isSafe(int (*) [16], int, int, int) [3] [7] 5.4 1.98 0.00 41212484 UsedInBox(int (*) [16], int, int, int) [7] ----------------------------------------------- 0.00 0.00 1/1 main [1] [14] 0.0 0.00 0.00 1 printGrid(int (*) [16]) [14] ----------------------------------------------- Index by function name [2] SolveSudoku(int (*) [16]) [7] UsedInBox(int (*) 
[16], int, int, int) [14] printGrid(int (*) [16]) [4] FindUnassignedLocation(int (*) [16], int&, int&) [6] UsedInCol(int (*) [16], int, int) [3] isSafe(int (*) [16], int, int, int) [5] UsedInRow(int (*) [16], int, int)
For 25x25 Sudoku Puzzle (5x5 squares) Puzzle from: http://www.sudoku-download.net/sudoku_25x25.php
Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls Ks/call Ks/call name 48.76 1052.18 1052.18 425478951 0.00 0.00 UsedInRow(int (*) [25], int, int) 25.24 1596.81 544.63 876012758 0.00 0.00 FindUnassignedLocation(int (*) [25], int&, int&) 12.48 1866.03 269.21 590817023 0.00 0.00 UsedInCol(int (*) [25], int, int) 4.83 1970.24 104.21 425478951 0.00 0.00 isSafe(int (*) [25], int, int, int) 4.79 2073.51 103.27 1 0.10 2.17 SolveSudoku(int (*) [25]) 4.35 2167.39 93.89 1355081265 0.00 0.00 UsedInBox(int (*) [25], int, int, int) 0.01 2167.56 0.17 frame_dummy 0.00 2167.56 0.00 1 0.00 0.00 printGrid(int (*) [25])
Call graph granularity: each sample hit covers 2 byte(s) for 0.00% of 2085.44 seconds index % time self children called name <spontaneous> [1] 100.0 0.00 2085.30 main [1] 97.03 1988.27 1/1 SolveSudoku(int (*) [25]) [2] 0.00 0.00 1/1 printGrid(int (*) [25]) [14] ----------------------------------------------- 876012757 SolveSudoku(int (*) [25]) [2] 97.03 1988.27 1/1 main [1] [2] 100.0 97.03 1988.27 1+876012757 SolveSudoku(int (*) [25]) [2] 101.19 1361.55 425478951/425478951 isSafe(int (*) [25], int, int, int) [3] 525.53 0.00 876012758/876012758 FindUnassignedLocation(int (*) [25], int&, int&) [5] 876012757 SolveSudoku(int (*) [25]) [2] ----------------------------------------------- 101.19 1361.55 425478951/425478951 SolveSudoku(int (*) [25]) [2] [3] 70.1 101.19 1361.55 425478951 isSafe(int (*) [25], int, int, int) [3] 1011.03 0.00 425478951/425478951 UsedInRow(int (*) [25], int, int) [4] 259.56 0.00 590817023/590817023 UsedInCol(int (*) [25], int, int) [6] 90.96 0.00 1355081265/1355081265 UsedInBox(int (*) [25], int, int, int) [7] ----------------------------------------------- 1011.03 0.00 425478951/425478951 isSafe(int (*) [25], int, int, int) [3] [4] 48.5 1011.03 0.00 425478951 UsedInRow(int (*) [25], int, int) [4] ----------------------------------------------- 525.53 0.00 876012758/876012758 SolveSudoku(int (*) [25]) [2] [5] 25.2 525.53 0.00 876012758 FindUnassignedLocation(int (*) [25], int&, int&) [5] ----------------------------------------------- 259.56 0.00 590817023/590817023 isSafe(int (*) [25], int, int, int) [3] [6] 12.4 259.56 0.00 590817023 UsedInCol(int (*) [25], int, int) [6] ----------------------------------------------- 90.96 0.00 1355081265/1355081265 isSafe(int (*) [25], int, int, int) [3] [7] 4.4 90.96 0.00 1355081265 UsedInBox(int (*) [25], int, int, int) [7] ----------------------------------------------- <spontaneous> [8] 0.0 0.14 0.00 frame_dummy [8] ----------------------------------------------- 0.00 0.00 1/1 main [1] [14] 0.0 0.00 
0.00 1 printGrid(int (*) [25]) [14] ----------------------------------------------- Index by function name [2] SolveSudoku(int (*) [25]) [7] UsedInBox(int (*) [25], int, int, int) [14] printGrid(int (*) [25]) [5] FindUnassignedLocation(int (*) [25], int&, int&) [6] UsedInCol(int (*) [25], int, int) [8] frame_dummy [3] isSafe(int (*) [25], int, int, int) [4] UsedInRow(int (*) [25], int, int)
Assignment 1: EasyBMP
EasyBMP Bitmap image library (Sample Program: Image to black and white renderer)
Library: http://easybmp.sourceforge.net/
Sample code: |
---|
#include <cstdio>
#include "EasyBMP.h"
using namespace std;

/*
 * Image to black-and-white renderer.
 *
 * Reads the bitmap named by argv[1], thresholds every interior pixel to
 * pure white or pure black, and writes the result as a 1-bit bitmap to
 * argv[2].
 *
 * Returns 0 on success and 1 when the arguments are missing or the
 * input/output file cannot be read/written.
 */
int main(int argc, char* argv[])
{
    // Both file names are required; without this check argv[1]/argv[2]
    // would be read past the end of the argument list.
    if (argc < 3)
    {
        fprintf(stderr, "usage: %s <input.bmp> <output.bmp>\n",
                argc > 0 ? argv[0] : "BW");
        return 1;
    }

    // Load the source image with EasyBMP
    BMP Background;
    if (!Background.ReadFromFile(argv[1]))
    {
        fprintf(stderr, "could not read %s\n", argv[1]);
        return 1;
    }

    BMP Output;
    int picWidth = Background.TellWidth();
    int picHeight = Background.TellHeight();
    Output.SetSize(Background.TellWidth(), Background.TellHeight());
    Output.SetBitDepth(1);

    // NOTE(review): the loops start at 1 and stop one short of the edge,
    // so the outermost ring of pixels is never written and keeps whatever
    // SetSize initialised it to. Confirm whether the border was meant to
    // be skipped.
    for (int i = 1; i < picWidth - 1; ++i)
    {
        for (int j = 1; j < picHeight - 1; ++j)
        {
            // Weighted brightness: red counts ten times as much as blue
            // or green (weights sum to 12).
            int col = (Background(i, j)->Blue +
                       Background(i, j)->Green +
                       10 * Background(i, j)->Red) / 12;

            if (col > 127)
            {
                Output(i, j)->Red = 255;
                Output(i, j)->Blue = 255;
                Output(i, j)->Green = 255;
            }
            else
            {
                Output(i, j)->Red = 0;
                Output(i, j)->Blue = 0;
                Output(i, j)->Green = 0;
            }
        }
    }

    if (!Output.WriteToFile(argv[2]))
    {
        fprintf(stderr, "could not write %s\n", argv[2]);
        return 1;
    }
    return 0;
}
The program was compiled using the following commands:
g++ -c -pg -g BW.cpp EasyBMP.cpp g++ -pg BW.o EasyBMP.o -o BW rm *.o
Attempted to run the program with a number of files (8K resolution):
Flat profile (Cabin): |
---|
Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 31.38 1.74 1.74 33177600 0.00 0.00 BMP::FindClosestColor(RGBApixel&) 23.52 3.04 1.30 198921624 0.00 0.00 BMP::operator()(int, int) 9.95 3.59 0.55 2 0.28 0.28 BMP::SetSize(int, int) 7.60 4.01 0.42 41663472 0.00 0.00 BMP::GetColor(int) 6.87 4.39 0.38 main 5.43 4.69 0.30 74841076 0.00 0.00 IntPow(int, int) 3.62 4.89 0.20 74841072 0.00 0.00 BMP::TellNumberOfColors() 3.35 5.07 0.19 4320 0.00 0.00 BMP::Write1bitRow(unsigned char*, int, int) 2.53 5.21 0.14 124990416 0.00 0.00 IntSquare(int) 2.17 5.33 0.12 4320 0.00 0.00 BMP::Read24bitRow(unsigned char*, int, int) 1.63 5.42 0.09 2 0.05 0.05 BMP::~BMP() 0.90 5.47 0.05 GetEasyBMPwarningState() 0.72 5.51 0.04 1 0.04 0.04 _GLOBAL__sub_I_EasyBMPwarnings 0.18 5.52 0.01 2 0.01 0.01 BMP::TellWidth() 0.18 5.53 0.01 1 0.01 2.99 BMP::WriteToFile(char const*) 0.00 5.53 0.00 16 0.00 0.00 SafeFread(char*, int, int, _IO_FILE*) 0.00 5.53 0.00 6 0.00 0.00 IsBigEndian() 0.00 5.53 0.00 2 0.00 0.00 EasyBMPcheckDataSize() 0.00 5.53 0.00 2 0.00 0.00 BMP::TellHeight() 0.00 5.53 0.00 2 0.00 0.00 BMP::SetBitDepth(int) 0.00 5.53 0.00 2 0.00 0.00 BMP::BMP() 0.00 5.53 0.00 2 0.00 0.00 BMFH::BMFH() 0.00 5.53 0.00 2 0.00 0.00 BMIH::BMIH() 0.00 5.53 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 5.53 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 5.53 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 5.53 0.00 1 0.00 0.40 BMP::ReadFromFile(char const*) 0.00 5.53 0.00 1 0.00 0.00 BMP::CreateStandardColorTable() |
Flat profile (Lake): |
---|
Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 30.60 1.71 1.71 33177600 0.00 0.00 BMP::FindClosestColor(RGBApixel&) 21.12 2.89 1.18 198921624 0.00 0.00 BMP::operator()(int, int) 10.20 3.46 0.57 2 0.29 0.29 BMP::SetSize(int, int) 8.59 3.94 0.48 main 6.26 4.29 0.35 76982189 0.00 0.00 IntPow(int, int) 6.08 4.63 0.34 43804585 0.00 0.00 BMP::GetColor(int) 5.55 4.94 0.31 76982185 0.00 0.00 BMP::TellNumberOfColors() 3.76 5.15 0.21 4320 0.00 0.00 BMP::Write1bitRow(unsigned char*, int, int) 3.76 5.36 0.21 131413755 0.00 0.00 IntSquare(int) 2.15 5.48 0.12 4320 0.00 0.00 BMP::Read24bitRow(unsigned char*, int, int) 1.07 5.54 0.06 2 0.03 0.03 BMP::~BMP() 0.54 5.57 0.03 GetEasyBMPwarningState() 0.36 5.59 0.02 1 0.02 0.02 _GLOBAL__sub_I_EasyBMPwarnings 0.00 5.59 0.00 16 0.00 0.00 SafeFread(char*, int, int, _IO_FILE*) 0.00 5.59 0.00 6 0.00 0.00 IsBigEndian() 0.00 5.59 0.00 2 0.00 0.00 EasyBMPcheckDataSize() 0.00 5.59 0.00 2 0.00 0.00 BMP::TellHeight() 0.00 5.59 0.00 2 0.00 0.00 BMP::SetBitDepth(int) 0.00 5.59 0.00 2 0.00 0.00 BMP::TellWidth() 0.00 5.59 0.00 2 0.00 0.00 BMP::BMP() 0.00 5.59 0.00 2 0.00 0.00 BMFH::BMFH() 0.00 5.59 0.00 2 0.00 0.00 BMIH::BMIH() 0.00 5.59 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 5.59 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 5.59 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 5.59 0.00 1 0.00 3.13 BMP::WriteToFile(char const*) 0.00 5.59 0.00 1 0.00 0.41 BMP::ReadFromFile(char const*) 0.00 5.59 0.00 1 0.00 0.00 BMP::CreateStandardColorTable() |
Assignment 1: Julia Sets
This portion of the assignment focuses on Julia sets with the quadratic formula:
$f_c(z) = z^2 + c$, where $c$ and $z$ are complex numbers
Pseudocode
for(Pixel pix in image){ pix.color = colorFunction(escapeValue(pix.loc, julia)); }
escapeValue(Complex loc, Complex julia){ int cycles = 0; while(|loc| <=2 && ++cycles < MAXCYCLES){ loc = loc * loc + julia; } return cycles; }
Julia.cpp |
---|
/** * Julia.cpp * Vincent Terpstra * Feb 18 / 2019 * Functions to display a Julia set */ #include <iostream> #include <math.h> #include "Bitmap.h" #define MAXCYCLES 1000 #define ITER 1 #define PI 3.14159 void calcJulia(int* array, int width, int height, float jR, float jI); void createBMP(int* array, int width, int height); int main(int size, const char ** args) { int height = (size > 1) ? atoi(args[1]) : 1000; int width = height / 2 * 3; int * array = new int[height * width]; //Initial Julia Point value float jI = -0.4f; float jR = 0.6f; //for rotating the Julia point around the origin float angle = 2.0f / ITER * PI; float cosA = cos(angle); float sinA = sin(angle); for(int i = 0; i < ITER; i++){ calcJulia(array, width, height, jI, jR); createBMP(array, width, height); //rotate the Julia point float tmp = jI; jI = cosA * jI - sinA * jR; jR = cosA * jR + sinA * tmp; } delete[] array; return 0; } /** * calcDepth * finds the escape value of the Julia set * inputs sR, sI: starting position on the map * jR, jI: values of the julia set * exits when |s| > 2 or reached MAXCYCLES * return the escape value or MAXCYCLES */ int calcDepth(float sR, float sI, const float jR, const float jI){ int cycle = 0; while(sR * sR + sI * sI <= 4 && ++cycle < MAXCYCLES){ // s(next) = s * s + j float tmp = sR; sR = sR * sR - sI * sI + jR; sI = 2 * tmp * sI + jI; } return cycle; } /** * calcJulia * Function to calculate the values from (-1.5, 1.5),(-1, 1) in the Julia set * saves the values onto the array */ void calcJulia(int * array, int width, int height, const float jR, const float jI){ float delta = 2 / (float)height; float x, y = -1; for(int i = 0; i < height; i++){ x = -1.5f; for(int j = 0; j < width; j++){ *array++ = calcDepth(x, y, jR, jI); x+=delta; } y += delta; } } struct Pix { float val; int r, g, b; Pix(float val, int r, int g, int b) : val(val), r(r), g(g), b(b) {} }; bool lerp(float clr, Pix& first, Pix& second, Bitmap & map); /** * createBMP * maps the values in the array to 
a .bmp file * linear interpolates between color values */ void createBMP(int * array, int width, int height){ static int idx = 0; char name[] = "julia000.bmp"; name[5] = (idx / 100) + '0'; name[6] = ((idx / 10) % 10) + '0'; name[7] = (idx % 10) + '0'; Bitmap map (name, width, height); //idx++; const int pix = width * height; struct Pix clr0(0, 20, 20, 30), clr1(10, 20, 10, 100), clr2(180, 20, 170, 205), clr3(MAXCYCLES, 0, 0, 0); const int * pMax = array + width * height; while(array < pMax){ float clr = *array++; bool draw = lerp(clr, clr0, clr1, map) || lerp(clr, clr1, clr2, map) || lerp(clr, clr2, clr3, map); } } /** * Function to linearly interpolate color values and draw to the Bitmap */ bool lerp(float clr, Pix& first, Pix& second, Bitmap & map){ if(clr <= second.val){ float diff = (clr - first.val) / (second.val - first.val); map.addColor( first.b + diff * (second.b - first.b), first.g + diff * (second.g - first.g), first.r + diff * (second.r - first.r) ); return true; } return false; } |
The full C++ code can be viewed via the GitHub link.
This code was tested using the following parameters: real range R ∈ [-1.5, 1.5], imaginary range I ∈ [-1, 1], MAXCYCLES = 1000, and Julia values c = 0.72 · e^(iθ) with θ ∈ [0, 2π] sampled at 100 intervals.
Flat Profiles |
---|
Width: 750 Height: 500 Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls ns/call ns/call name 96.00 20.87 20.87 calcJulia(int*, int, int, float, float) 2.08 21.32 0.45 37500000 12.04 12.04 Bitmap::addColor(int, int, int) 1.75 21.70 0.38 49447183 7.71 16.84 lerp(float, Pix&, Pix&, Bitmap&) 0.46 21.80 0.10 createBMP(int*, int, int) 0.00 21.80 0.00 100 0.00 0.00 generateBitmapImage(unsigned char*, int, int, char const*) 0.00 21.80 0.00 100 0.00 0.00 createBitmapFileHeader(int, int, int) 0.00 21.80 0.00 100 0.00 0.00 createBitmapInfoHeader(int, int) 0.00 21.80 0.00 100 0.00 0.00 Bitmap::Bitmap(char const*, int, int) 0.00 21.80 0.00 100 0.00 0.00 Bitmap::~Bitmap() 0.00 21.80 0.00 1 0.00 0.00 _GLOBAL__sub_I_main Width: 1500 Height: 1000 Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls us/call us/call name 97.05 84.67 84.67 calcJulia(int*, int, int, float, float) 1.66 86.12 1.45 150000000 0.01 0.01 Bitmap::addColor(int, int, int) 1.12 87.09 0.97 197791886 0.00 0.01 lerp(float, Pix&, Pix&, Bitmap&) 0.37 87.41 0.32 createBMP(int*, int, int) 0.01 87.42 0.01 100 50.17 50.17 Bitmap::Bitmap(char const*, int, int) 0.00 87.42 0.00 100 0.00 0.00 generateBitmapImage(unsigned char*, int, int, char const*) 0.00 87.42 0.00 100 0.00 0.00 createBitmapFileHeader(int, int, int) 0.00 87.42 0.00 100 0.00 0.00 createBitmapInfoHeader(int, int) 0.00 87.42 0.00 100 0.00 0.00 Bitmap::~Bitmap() 0.00 87.42 0.00 1 0.00 0.00 _GLOBAL__sub_I_main Width: 2250 Height: 1500 Flat profile: Each sample counts as 0.01 seconds. 
% cumulative self self total time seconds seconds calls us/call us/call name 96.39 189.55 189.55 calcJulia(int*, int, int, float, float) 1.89 193.27 3.72 337500000 0.01 0.01 Bitmap::addColor(int, int, int) 1.49 196.20 2.93 445028641 0.01 0.01 lerp(float, Pix&, Pix&, Bitmap&) 0.45 197.09 0.89 createBMP(int*, int, int) 0.01 197.11 0.02 100 150.51 150.51 Bitmap::Bitmap(char const*, int, int) 0.00 197.11 0.00 100 0.00 0.00 generateBitmapImage(unsigned char*, int, int, char const*) 0.00 197.11 0.00 100 0.00 0.00 createBitmapFileHeader(int, int, int) 0.00 197.11 0.00 100 0.00 0.00 createBitmapInfoHeader(int, int) 0.00 197.11 0.00 100 0.00 0.00 Bitmap::~Bitmap() 0.00 197.11 0.00 1 0.00 0.00 _GLOBAL__sub_I_main Width: 3000 Height: 2000 Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls us/call us/call name 95.74 332.96 332.96 calcJulia(int*, int, int, float, float) 2.79 342.68 9.72 600000000 0.02 0.02 Bitmap::addColor(int, int, int) 1.28 347.14 4.46 791167185 0.01 0.02 lerp(float, Pix&, Pix&, Bitmap&) 0.44 348.66 1.52 createBMP(int*, int, int) 0.01 348.69 0.03 100 250.84 250.84 Bitmap::Bitmap(char const*, int, int) 0.00 348.69 0.00 100 0.00 0.00 generateBitmapImage(unsigned char*, int, int, char const*) 0.00 348.69 0.00 100 0.00 0.00 createBitmapFileHeader(int, int, int) 0.00 348.69 0.00 100 0.00 0.00 createBitmapInfoHeader(int, int) 0.00 348.69 0.00 100 0.00 0.00 Bitmap::~Bitmap() 0.00 348.69 0.00 1 0.00 0.00 _GLOBAL__sub_I_main |
Call Graphs |
---|
Width: 750 Height: 500 Call graph granularity: each sample hit covers 2 byte(s) for 0.05% of 21.80 seconds index % time self children called name <spontaneous> [1] 96.0 20.93 0.00 calcJulia(int*, int, int, float, float) [1] ----------------------------------------------- <spontaneous> [2] 4.0 0.14 0.73 createBMP(int*, int, int) [2] 0.33 0.40 49447183/49447183 lerp(float, Pix&, Pix&, Bitmap&) [3] 0.00 0.00 100/100 Bitmap::Bitmap(char const*, int, int) [15] 0.00 0.00 100/100 Bitmap::~Bitmap() [16] ----------------------------------------------- 0.33 0.40 49447183/49447183 createBMP(int*, int, int) [2] [3] 3.4 0.33 0.40 49447183 lerp(float, Pix&, Pix&, Bitmap&) [3] 0.40 0.00 37500000/37500000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 0.40 0.00 37500000/37500000 lerp(float, Pix&, Pix&, Bitmap&) [3] [4] 1.8 0.40 0.00 37500000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 0.00 0.00 100/100 Bitmap::~Bitmap() [16] [12] 0.0 0.00 0.00 100 generateBitmapImage(unsigned char*, int, int, char const*)[12] 0.00 0.00 100/100 createBitmapFileHeader(int, int, int) [13] 0.00 0.00 100/100 createBitmapInfoHeader(int, int) [14] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [12] [13] 0.0 0.00 0.00 100 createBitmapFileHeader(int, int, int) [13] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [12] [14] 0.0 0.00 0.00 100 createBitmapInfoHeader(int, int) [14] ----------------------------------------------- 0.00 0.00 100/100 createBMP(int*, int, int) [2] [15] 0.0 0.00 0.00 100 Bitmap::Bitmap(char const*, int, int) [15] ----------------------------------------------- 0.00 0.00 100/100 createBMP(int*, int, int) [2] [16] 0.0 0.00 0.00 100 Bitmap::~Bitmap() [16] 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [12] 
----------------------------------------------- 0.00 0.00 1/1 __libc_csu_init [25] [17] 0.0 0.00 0.00 1 _GLOBAL__sub_I_main [17] ----------------------------------------------- Index by function name [17] _GLOBAL__sub_I_main (julia.cpp) [3] lerp(float, Pix&, Pix&, Bitmap&) [15] Bitmap::Bitmap(char const*, int, int) [12] generateBitmapImage(unsigned char*, int, int, char const*) [1] calcJulia(int*, int, int, float, float) [16] Bitmap::~Bitmap() [13] createBitmapFileHeader(int, int, int) [2] createBMP(int*, int, int) [14] createBitmapInfoHeader(int, int) [4] Bitmap::addColor(int, int, int)
Width: 1500 Height: 1000 Call graph granularity: each sample hit covers 2 byte(s) for 0.01% of 87.23 seconds index % time self children called name <spontaneous> [1] 96.0 83.70 0.00 calcJulia(int*, int, int, float, float) [1] ----------------------------------------------- <spontaneous> [2] 4.0 0.37 3.16 createBMP(int*, int, int) [2] 1.39 1.76 197791886/197791886 lerp(float, Pix&, Pix&, Bitmap&) [3] 0.01 0.00 100/100 Bitmap::Bitmap(char const*, int, int) [5] 0.00 0.00 100/100 Bitmap::~Bitmap() [16] ----------------------------------------------- 1.39 1.76 197791886/197791886 createBMP(int*, int, int) [2] [3] 3.6 1.39 1.76 197791886 lerp(float, Pix&, Pix&, Bitmap&) [3] 1.76 0.00 150000000/150000000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 1.76 0.00 150000000/150000000 lerp(float, Pix&, Pix&, Bitmap&) [3] [4] 2.0 1.76 0.00 150000000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 0.01 0.00 100/100 createBMP(int*, int, int) [2] [5] 0.0 0.01 0.00 100 Bitmap::Bitmap(char const*, int, int) [5] ----------------------------------------------- 0.00 0.00 100/100 Bitmap::~Bitmap() [16] [13] 0.0 0.00 0.00 100 generateBitmapImage(unsigned char*, int, int, char const*) [13] 0.00 0.00 100/100 createBitmapFileHeader(int, int, int) [14] 0.00 0.00 100/100 createBitmapInfoHeader(int, int) [15] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [13] [14] 0.0 0.00 0.00 100 createBitmapFileHeader(int, int, int) [14] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [13] [15] 0.0 0.00 0.00 100 createBitmapInfoHeader(int, int) [15] ----------------------------------------------- 0.00 0.00 100/100 createBMP(int*, int, int) [2] [16] 0.0 0.00 0.00 100 Bitmap::~Bitmap() [16] 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [13] 
----------------------------------------------- 0.00 0.00 1/1 __libc_csu_init [25] [17] 0.0 0.00 0.00 1 _GLOBAL__sub_I_main [17] ----------------------------------------------- Index by function name [17] _GLOBAL__sub_I_main (julia.cpp) [3] lerp(float, Pix&, Pix&, Bitmap&) [5] Bitmap::Bitmap(char const*, int, int) [13] generateBitmapImage(unsigned char*, int, int, char const*) [1] calcJulia(int*, int, int, float, float) [16] Bitmap::~Bitmap() [14] createBitmapFileHeader(int, int, int) [2] createBMP(int*, int, int) [15] createBitmapInfoHeader(int, int) [4] Bitmap::addColor(int, int, int)
Width: 2250 Height: 1500 Call graph granularity: each sample hit covers 2 byte(s) for 0.01% of 196.08 seconds index % time self children called name <spontaneous> [1] 96.0 188.22 0.00 calcJulia(int*, int, int, float, float) [1] ----------------------------------------------- <spontaneous> [2] 4.0 0.95 6.91 createBMP(int*, int, int) [2] 2.78 4.12 445028641/445028641 lerp(float, Pix&, Pix&, Bitmap&) [3] 0.00 0.01 100/100 Bitmap::~Bitmap() [6] 0.01 0.00 100/100 Bitmap::Bitmap(char const*, int, int) [7] ----------------------------------------------- 2.78 4.12 445028641/445028641 createBMP(int*, int, int) [2] [3] 3.5 2.78 4.12 445028641 lerp(float, Pix&, Pix&, Bitmap&) [3] 4.12 0.00 337500000/337500000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 4.12 0.00 337500000/337500000 lerp(float, Pix&, Pix&, Bitmap&) [3] [4] 2.1 4.12 0.00 337500000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 0.01 0.00 100/100 Bitmap::~Bitmap() [6] [5] 0.0 0.01 0.00 100 generateBitmapImage(unsigned char*, int, int, char const*) [5] 0.00 0.00 100/100 createBitmapFileHeader(int, int, int) [15] 0.00 0.00 100/100 createBitmapInfoHeader(int, int) [16] ----------------------------------------------- 0.00 0.01 100/100 createBMP(int*, int, int) [2] [6] 0.0 0.00 0.01 100 Bitmap::~Bitmap() [6] 0.01 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [5] ----------------------------------------------- 0.01 0.00 100/100 createBMP(int*, int, int) [2] [7] 0.0 0.01 0.00 100 Bitmap::Bitmap(char const*, int, int) [7] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [5] [15] 0.0 0.00 0.00 100 createBitmapFileHeader(int, int, int) [15] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [5] [16] 0.0 0.00 0.00 100 createBitmapInfoHeader(int, int) [16] 
----------------------------------------------- 0.00 0.00 1/1 __libc_csu_init [25] [17] 0.0 0.00 0.00 1 _GLOBAL__sub_I_main [17] ----------------------------------------------- Index by function name [17] _GLOBAL__sub_I_main (julia.cpp) [3] lerp(float, Pix&, Pix&, Bitmap&) [7] Bitmap::Bitmap(char const*, int, int) [5] generateBitmapImage(unsigned char*, int, int, char const*) [1] calcJulia(int*, int, int, float, float) [6] Bitmap::~Bitmap() [15] createBitmapFileHeader(int, int, int) [2] createBMP(int*, int, int) [16] createBitmapInfoHeader(int, int) [4] Bitmap::addColor(int, int, int) Width: 3000 Height: 2000 Call graph granularity: each sample hit covers 2 byte(s) for 0.00% of 347.05 seconds index % time self children called name <spontaneous> [1] 96.1 333.37 0.00 calcJulia(int*, int, int, float, float) [1] ----------------------------------------------- <spontaneous> [2] 3.9 1.21 12.47 createBMP(int*, int, int) [2] 5.74 6.73 791167185/791167185 lerp(float, Pix&, Pix&, Bitmap&) [3] 0.01 0.00 100/100 Bitmap::Bitmap(char const*, int, int) [5] 0.00 0.00 100/100 Bitmap::~Bitmap() [16] ----------------------------------------------- 5.74 6.73 791167185/791167185 createBMP(int*, int, int) [2] [3] 3.6 5.74 6.73 791167185 lerp(float, Pix&, Pix&, Bitmap&) [3] 6.73 0.00 600000000/600000000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 6.73 0.00 600000000/600000000 lerp(float, Pix&, Pix&, Bitmap&) [3] [4] 1.9 6.73 0.00 600000000 Bitmap::addColor(int, int, int) [4] ----------------------------------------------- 0.01 0.00 100/100 createBMP(int*, int, int) [2] [5] 0.0 0.01 0.00 100 Bitmap::Bitmap(char const*, int, int) [5] ----------------------------------------------- 0.00 0.00 100/100 Bitmap::~Bitmap() [16] [13] 0.0 0.00 0.00 100 generateBitmapImage(unsigned char*, int, int, char const*) [13] 0.00 0.00 100/100 createBitmapFileHeader(int, int, int) [14] 0.00 0.00 100/100 createBitmapInfoHeader(int, int) [15] 
----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [13] [14] 0.0 0.00 0.00 100 createBitmapFileHeader(int, int, int) [14] ----------------------------------------------- 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [13] [15] 0.0 0.00 0.00 100 createBitmapInfoHeader(int, int) [15] ----------------------------------------------- 0.00 0.00 100/100 createBMP(int*, int, int) [2] [16] 0.0 0.00 0.00 100 Bitmap::~Bitmap() [16] 0.00 0.00 100/100 generateBitmapImage(unsigned char*, int, int, char const*) [13] ----------------------------------------------- 0.00 0.00 1/1 __libc_csu_init [25] [17] 0.0 0.00 0.00 1 _GLOBAL__sub_I_main [17] ----------------------------------------------- Index by function name [17] _GLOBAL__sub_I_main (julia.cpp) [3] lerp(float, Pix&, Pix&, Bitmap&) [5] Bitmap::Bitmap(char const*, int, int) [13] generateBitmapImage(unsigned char*, int, int, char const*) [1] calcJulia(int*, int, int, float, float) [16] Bitmap::~Bitmap() [14] createBitmapFileHeader(int, int, int) [2] createBMP(int*, int, int) [15] createBitmapInfoHeader(int, int) [4] Bitmap::addColor(int, int, int) |
Generated Image of Julia set at (-0.4, 0.6) |
---|
Assignment 2
Code for Solving a Sudoku using backtracking |
---|
#include <stdio.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <iomanip>
// CUDA header file
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// A Backtracking program in C++ to solve Sudoku problem
#include <stdio.h>

// UNASSIGNED is used for empty cells in sudoku grid
#define UNASSIGNED 0
// N is used for the size of Sudoku grid. Size will be NxN
// BOXWIDTH is the side length of one box; N must equal BOXWIDTH * BOXWIDTH
#define BOXWIDTH 3
#define N 9

// This function finds an entry in grid that is still unassigned
bool FindUnassignedLocation(int grid[N][N], int &row, int &col);

/*
 * makeGuess
 * Finds the smallest value greater than `guess` that does not already occur
 * in the column, row or box of cell (row, col), and stores it in that cell.
 * SolveSudoku launches it as a single block of N x 3 threads:
 *   threadIdx.y == 0 -> one thread per cell of the column
 *   threadIdx.y == 1 -> one thread per cell of the row
 *   threadIdx.y == 2 -> one thread per cell of the box
 * d_a:   the N x N puzzle on the device, indexed as row * N + col
 * guess: the last value already tried for this cell (0 for the first try)
 * The stored value can exceed N when every candidate conflicts; the host
 * treats a value greater than N as "no legal value left".
 */
__global__ void makeGuess(int* d_a, int guess, int row, int col) {
    // set when any thread of the block sees the current guess in its cell
    __shared__ bool found;
    // true only for thread (0, 0), which owns the writes that reset `found`
    // and the final store of the result
    bool control = !(threadIdx.x || threadIdx.y);
    // index into d_a of the cell this thread compares against
    int tidx = 0;
    if (threadIdx.y == 2) { //BOXES: threadIdx.x walks the box row by row from its top-left cell
        tidx = (((row / BOXWIDTH) * BOXWIDTH) + threadIdx.x / BOXWIDTH) * N + threadIdx.x % BOXWIDTH + (col / BOXWIDTH) * BOXWIDTH;
    } else if (threadIdx.y == 1) { //ROWS: threadIdx.x walks along the row of the target cell
        tidx = row * N + threadIdx.x;
    } else { //COLUMNS: threadIdx.x walks down the column of the target cell
        tidx = threadIdx.x * N + col;
    }
    // force at least one pass through the loop
    if (control) found = true;
    __syncthreads();
    while (found) {
        // every thread has read the loop condition before it is cleared
        __syncthreads();
        if (control) found = false;
        __syncthreads();
        // each thread advances its own copy of guess, so all copies stay equal
        if (d_a[tidx] == ++guess) found = true;
        // make the result of this round visible before the condition is re-read
        __syncthreads();
    }
    if (control) d_a[row * N + col] = guess;
}

/* Takes a partially filled-in grid and attempts to assign values to all
   unassigned locations in such a way to meet the requirements for Sudoku
   solution (non-duplication across rows, columns, and boxes).
   grid:     host copy of the puzzle; it is only read here, to locate the
             cells that were empty at the start
   d_a:      device copy of the puzzle; all guesses are written here
   row, col: position from which the search for the next empty cell starts
   returns true when every empty cell received a value, false to make the
   caller try its next guess */
bool SolveSudoku(int grid[N][N], int* d_a, int row, int col) {
    // If there is no unassigned location, we are done
    if (!FindUnassignedLocation(grid, row, col)) return true; // success!
    dim3 block(N, 3);
    int guess = 0;
    while (guess <= N) {
        // the kernel writes the next conflict-free value above `guess` into the cell
        makeGuess << <1, block >> > (d_a, guess, row, col);
        // read that value back so the host knows which value was chosen
        cudaMemcpy(&guess, d_a + row * N + col, sizeof(int), cudaMemcpyDeviceToHost);
        // continue with the cells after this one
        if (guess <= N && SolveSudoku(grid, d_a, row, col + 1)) return true;
    }
    // Erase the guess in the device copy of the puzzle
    int zero = UNASSIGNED;
    cudaMemcpy(d_a + row * N + col, &zero, sizeof(int), cudaMemcpyHostToDevice);
    return false; // this triggers backtracking
}

/* Searches the grid to find an entry that is still unassigned. The search
   starts at the incoming row, col and proceeds row by row. If found, the
   reference parameters row, col will be set the location that is unassigned,
   and true is returned. If no unassigned entries remain, false is returned. */
bool FindUnassignedLocation(int grid[N][N], int &row, int &col) {
    for (; row < N; ++row) {
        for (; col < N; ++col)
            if (grid[row][col] == UNASSIGNED) return true;
        // following rows are scanned from their first column
        col = 0;
    }
    return false;
}

/* A utility function to print grid */
void printGrid(int grid[N][N]) {
    for (int row = 0; row < N; row++){
        for (int col = 0; col < N; col++)
            printf("%2d", grid[row][col]);
        printf("\n");
    }
}

/* Driver Program to test above functions */
int main() {
    /* 0 means unassigned cells */
    int grid[N][N] = {{3, 0, 6, 5, 0, 8, 4, 0, 0},
                      {5, 2, 0, 0, 0, 0, 0, 0, 0},
                      {0, 8, 7, 0, 0, 0, 0, 3, 1},
                      {0, 0, 3, 0, 1, 0, 0, 8, 0},
                      {9, 0, 0, 8, 6, 3, 0, 0, 5},
                      {0, 5, 0, 0, 9, 0, 6, 0, 0},
                      {1, 3, 0, 0, 0, 0, 2, 5, 0},
                      {0, 0, 0, 0, 0, 0, 0, 7, 4},
                      {0, 0, 5, 2, 0, 6, 3, 0, 0} };
    // Disabled 16x16 puzzle; using it also requires BOXWIDTH 4 and N 16
    /**
    int grid[N][N] = {{0, 8, 0, 0, 0, 0, 0, 3, 0, 0, 0, 10, 9, 7, 11, 0},
                      {0, 9, 15, 13, 0, 10, 0, 0, 2, 6, 8, 16, 0, 0, 0, 0},
                      {0, 0, 16, 0, 15, 0, 8, 0, 9, 0, 0, 0, 6, 0, 2, 0},
                      {1, 0, 2, 0, 9, 11, 4, 6, 15, 3, 5, 7, 0, 0, 12, 0},
                      {16, 6, 4, 0, 5, 2, 0, 0, 1, 0, 0, 0, 11, 0, 0, 12},
                      {5, 11, 0, 0, 0, 3, 0, 15, 0, 16, 0, 13, 0, 1, 0, 8},
                      {0, 0, 3, 0, 0, 6, 11, 14, 0, 5, 7, 0, 0, 9, 0, 0},
                      {0, 0, 0, 14, 8, 0, 10, 0, 0, 11, 12, 0, 0, 0, 0, 0},
                      {0, 7, 13, 0, 0, 0, 0, 12, 0, 8, 9, 0, 0, 0, 3, 0},
                      {0, 0, 11, 9, 0, 7, 0, 0, 0, 0, 0, 12, 0, 8, 16, 5},
                      {0, 0, 10, 0, 11, 13, 0, 0, 0, 0, 0, 3, 12, 0, 6, 0},
                      {0, 5, 0, 0, 10, 15, 0, 1, 7, 2, 0, 0, 14, 11, 0, 0},
                      {0, 0, 5, 0, 0, 12, 14, 0, 0, 10, 0, 0, 15, 0, 0, 4},
                      {9, 0, 14, 6, 0, 0, 1, 0, 16, 0, 2, 0, 3, 0, 13, 0},
                      {8, 13, 0, 4, 0, 0, 0, 0, 12, 7, 3, 0, 0, 6, 0, 0},
                      {0, 16, 12, 0, 0, 5, 0, 9, 0, 13, 14, 4, 1, 0, 0, 0} };
    /**/
    // device copy of the puzzle
    int* d_a;
    cudaMalloc((void**)&d_a, N*N * sizeof(int));
    cudaMemcpy(d_a, grid, N*N * sizeof(int), cudaMemcpyHostToDevice);
    // NOTE(review): the return value is ignored, so the grid is printed even
    // when no solution was found
    SolveSudoku(grid, d_a, 0, 0);
    // fetch the filled-in puzzle for printing
    cudaMemcpy(grid, d_a, N*N * sizeof(int), cudaMemcpyDeviceToHost);
    printGrid(grid);
    cudaFree(d_a);
    return 0;
}
|
This code is capable of solving the supplied 9x9 puzzle. However, because the backtracking algorithm substitutes values one guess at a time and every guess incurs the communication delay between the GPU and the CPU, it is unable to solve the 16x16 puzzle in any reasonable amount of time (I stopped it after 10+ minutes). Considering the 130+ empty spaces in the grid, I estimate over 130^2 calls to cudaMemcpy either way...
Faster than Yours... |
---|
/**
 * Vincent Terpstra
 * Sudoku.cu
 * March 18 / 2019
 * An Optimistic approach to solving a Sudoku on a CUDA enabled GPU
 * Assumes that the puzzle is deterministic(single solvable solution)
 * AND each next step can be found with the kernel
 * KERNEL: educatedGuess
 *   searches each square in a box for
 *   squares that have only a single appropriate value
 *   OR values that (in the box) can only fit in one square
 */
#include <stdio.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <iomanip>
// CUDA header file
#include "cuda_runtime.h"
#include <device_launch_parameters.h>
// NOTE(review): presumably defined so that editors which are not nvcc still
// see the declarations in device_functions.h - confirm before removing
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <device_functions.h>

// UNASSIGNED is used for empty cells in sudoku grid
#define UNASSIGNED 0
// N is used for the size of Sudoku grid. Size will be NxN
// BOXWIDTH selects the puzzle: 3 -> 9x9, 4 -> 16x16, 5 -> 25x25 (see main)
#define BOXWIDTH 5
#define N (BOXWIDTH * BOXWIDTH)

/*
 * kernel to solve a sudoku
 * Input: sudoku puzzle partitioned into boxes
 *   (one block per box, one thread per square of the box)
 * d_a      = the sudoku puzzle, indexed as x + N * y
 * addedIdx = single int; receives the index of a square filled in this pass
 * figures out what values can fit in each square
 * figures out how many spots each value can go
 * assigns the appropriate values,
 * saves to addedIdx to show that there is a change
 *
 * Every thread of the block reaches every __syncthreads(): threads whose
 * square is already filled skip the work but still take part in the barriers.
 */
__global__ void educatedGuess(int * d_a, int * addedIdx) {
    // position of this square inside its box, and inside the whole grid
    int idx = threadIdx.x + BOXWIDTH * threadIdx.y;
    int gridX = threadIdx.x + BOXWIDTH * blockIdx.x;
    int gridY = threadIdx.y + BOXWIDTH * blockIdx.y;
    int gridIdx = gridX + N * gridY;

    __shared__ bool hasValue[N]; //If the value occurs in the box
    __shared__ int inBox[N];     //Number of places each integer can go in the box
    hasValue[idx] = false;
    inBox[idx] = 0;
    __syncthreads();

    int at = d_a[gridIdx];
    // only squares that are empty at the start of the pass look for a value
    const bool empty = (at == UNASSIGNED);
    if (!empty)
        hasValue[at - 1] = true;
    __syncthreads();

    //For remembering which values were seen in the box, row and column
    bool foundVal[N];
    if (empty) {
        for (int i = 0; i < N; ++i)
            foundVal[i] = hasValue[i];
        for (int check = 0; check < N; check++) {
            // empty squares hold UNASSIGNED (0) and must not be used as an index
            int inColumn = d_a[N * check + gridX];
            if (inColumn != UNASSIGNED)
                foundVal[inColumn - 1] = true;
            int inRow = d_a[N * gridY + check];
            if (inRow != UNASSIGNED)
                foundVal[inRow - 1] = true;
        }
        int fndVals = 0;
        for (int i = 0; i < N; ++i)
            if (!foundVal[i]) {
                fndVals++;
                at = i + 1;
            }
        if (fndVals == 1) {
            //Only one possible value for this index
            d_a[gridIdx] = at;      //assign value
            addedIdx[0] = gridIdx;  //to tell host that the table has changed
            inBox[at - 1] = 4;      //keeps the count for this value away from 1 below
        }
    }
    __syncthreads();

    //Calculate the number of places each integer can go in the box
    for (int i = 0; i < N; ++i) {
        int num = (idx + i) % N; //keep each thread on a separate idx
        if (empty && !foundVal[num])
            inBox[num]++;
        __syncthreads();
    }

    // no barrier follows, so filled squares may leave here
    if (!empty)
        return;
    for (int i = 0; i < N; ++i) {
        //if there is only one possible index for that value assign the value
        if (inBox[i] == 1 && !foundVal[i]) {
            d_a[gridIdx] = i + 1;   //assign value
            addedIdx[0] = gridIdx;  //to tell host that the table has changed
        }
    }
}

/*
 * Solves the Sudoku, with best values
 * Repeats the kernel until a pass reports the same index as the pass before,
 * i.e. until a pass no longer fills a square.
 * grid:      unused, kept for callers
 * d_a:       device copy of the puzzle, solved in place
 * d_results: device int that the kernel writes the last filled index to
 */
void SolveSudoku(int grid[N][N], int* d_a, int* d_results) {
    dim3 block(BOXWIDTH, BOXWIDTH);
    int lastIdx(-1), nextIdx(-1);
    do {
        lastIdx = nextIdx;
        // one block per box, one thread per square
        educatedGuess<<<block, block>>>(d_a, d_results);
        cudaMemcpy(&nextIdx, d_results, sizeof(int), cudaMemcpyDeviceToHost);
    } while (lastIdx != nextIdx);
}

/* A utility function to print grid */
void printGrid(int grid[N][N]) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            printf("%3d", grid[row][col]);
        printf("\n");
    }
}

/* Driver Program to test above functions */
int main() {
    /* 0 means unassigned cells; the puzzle is chosen by BOXWIDTH */
#if BOXWIDTH == 3
    int grid[N][N] = {
        {3, 0, 6, 5, 0, 8, 4, 0, 0},
        {5, 2, 0, 0, 0, 0, 0, 0, 0},
        {0, 8, 7, 0, 0, 0, 0, 3, 1},
        {0, 0, 3, 0, 1, 0, 0, 8, 0},
        {9, 0, 0, 8, 6, 3, 0, 0, 5},
        {0, 5, 0, 0, 9, 0, 6, 0, 0},
        {1, 3, 0, 0, 0, 0, 2, 5, 0},
        {0, 0, 0, 0, 0, 0, 0, 7, 4},
        {0, 0, 5, 2, 0, 6, 3, 0, 0} };
#elif BOXWIDTH == 4
    int grid[N][N] = {
        {0, 8, 0, 0, 0, 0, 0, 3, 0, 0, 0, 10, 9, 7, 11, 0},
        {0, 9, 15, 13, 0, 10, 0, 0, 2, 6, 8, 16, 0, 0, 0, 0},
        {0, 0, 16, 0, 15, 0, 8, 0, 9, 0, 0, 0, 6, 0, 2, 0},
        {1, 0, 2, 0, 9, 11, 4, 6, 15, 3, 5, 7, 0, 0, 12, 0},
        {16, 6, 4, 0, 5, 2, 0, 0, 1, 0, 0, 0, 11, 0, 0, 12},
        {5, 11, 0, 0, 0, 3, 0, 15, 0, 16, 0, 13, 0, 1, 0, 8},
        {0, 0, 3, 0, 0, 6, 11, 14, 0, 5, 7, 0, 0, 9, 0, 0},
        {0, 0, 0, 14, 8, 0, 10, 0, 0, 11, 12, 0, 0, 0, 0, 0},
        {0, 7, 13, 0, 0, 0, 0, 12, 0, 8, 9, 0, 0, 0, 3, 0},
        {0, 0, 11, 9, 0, 7, 0, 0, 0, 0, 0, 12, 0, 8, 16, 5},
        {0, 0, 10, 0, 11, 13, 0, 0, 0, 0, 0, 3, 12, 0, 6, 0},
        {0, 5, 0, 0, 10, 15, 0, 1, 7, 2, 0, 0, 14, 11, 0, 0},
        {0, 0, 5, 0, 0, 12, 14, 0, 0, 10, 0, 0, 15, 0, 0, 4},
        {9, 0, 14, 6, 0, 0, 1, 0, 16, 0, 2, 0, 3, 0, 13, 0},
        {8, 13, 0, 4, 0, 0, 0, 0, 12, 7, 3, 0, 0, 6, 0, 0},
        {0, 16, 12, 0, 0, 5, 0, 9, 0, 13, 14, 4, 1, 0, 0, 0} };
#elif BOXWIDTH == 5
    int grid[N][N] = {
        {1, 0, 4, 0, 25, 0, 19, 0, 0, 10, 21, 8, 0, 14, 0, 6, 12, 9, 0, 0, 0, 0, 0, 0, 5},
        {5, 0, 19, 23, 24, 0, 22, 12, 0, 0, 16, 6, 0, 20, 0, 18, 0, 25, 14, 13, 10, 11, 0, 1, 15},
        {0, 0, 0, 0, 0, 0, 21, 5, 0, 20, 11, 10, 0, 1, 0, 4, 8, 24, 23, 15, 18, 0, 16, 22, 19},
        {0, 7, 21, 8, 18, 0, 0, 0, 11, 0, 5, 0, 0, 24, 0, 0, 0, 17, 22, 1, 9, 6, 25, 0, 0},
        {0, 13, 15, 0, 22, 14, 0, 18, 0, 16, 0, 0, 0, 4, 0, 0, 0, 19, 0, 0, 0, 24, 20, 21, 17},
        {12, 0, 11, 0, 6, 0, 0, 0, 0, 15, 0, 0, 0, 0, 21, 25, 19, 0, 4, 0, 22, 14, 0, 20, 0},
        {8, 0, 0, 21, 0, 16, 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 17, 23, 18, 22, 0, 0, 0, 24, 6},
        {4, 0, 14, 18, 7, 9, 0, 22, 21, 19, 0, 0, 0, 2, 0, 5, 0, 0, 0, 6, 16, 15, 0, 11, 12},
        {22, 0, 24, 0, 23, 0, 0, 11, 0, 7, 0, 0, 4, 0, 14, 0, 2, 12, 0, 8, 5, 19, 0, 25, 9},
        {20, 0, 0, 0, 5, 0, 0, 0, 0, 17, 9, 0, 12, 18, 0, 1, 0, 0, 7, 24, 0, 0, 0, 13, 4},
        {13, 0, 0, 5, 0, 2, 23, 14, 4, 18, 22, 0, 17, 0, 0, 20, 0, 1, 9, 21, 12, 0, 0, 8, 11},
        {14, 23, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 20, 25, 0, 3, 4, 13, 0, 11, 21, 9, 5, 18, 22},
        {7, 0, 0, 11, 17, 20, 24, 0, 0, 0, 3, 4, 1, 12, 0, 0, 6, 14, 0, 5, 25, 13, 0, 0, 0},
        {0, 0, 16, 9, 0, 17, 11, 7, 10, 25, 0, 0, 0, 13, 6, 0, 0, 18, 0, 0, 19, 4, 0, 0, 20},
        {6, 15, 0, 19, 4, 13, 0, 0, 5, 0, 18, 11, 0, 0, 9, 8, 22, 16, 25, 10, 7, 0, 0, 0, 0},
        {0, 0, 0, 2, 0, 0, 10, 19, 3, 0, 1, 0, 22, 9, 4, 11, 15, 0, 20, 0, 0, 8, 23, 0, 25},
        {0, 24, 8, 13, 1, 0, 0, 4, 20, 0, 17, 14, 0, 0, 18, 0, 16, 22, 5, 0, 11, 0, 10, 0, 0},
        {23, 10, 0, 0, 0, 0, 0, 0, 18, 0, 6, 0, 16, 0, 0, 17, 1, 0, 13, 0, 0, 3, 19, 12, 0},
        {25, 5, 0, 14, 11, 0, 17, 0, 8, 24, 13, 0, 19, 23, 15, 9, 0, 0, 12, 0, 20, 0, 22, 0, 7},
        {0, 0, 17, 4, 0, 22, 15, 0, 23, 11, 12, 25, 0, 0, 0, 0, 18, 8, 0, 7, 0, 0, 14, 0, 13},
        {19, 6, 23, 22, 8, 0, 0, 1, 25, 4, 14, 2, 0, 3, 7, 13, 10, 11, 16, 0, 0, 0, 0, 0, 0},
        {0, 4, 0, 17, 0, 3, 0, 24, 0, 8, 20, 23, 11, 10, 25, 22, 0, 0, 0, 12, 13, 2, 18, 6, 0},
        {0, 0, 7, 16, 0, 0, 6, 17, 2, 21, 0, 18, 0, 0, 0, 19, 0, 0, 8, 0, 0, 0, 0, 4, 0},
        {18, 9, 25, 1, 2, 11, 0, 0, 13, 22, 4, 0, 21, 0, 5, 0, 23, 7, 0, 0, 15, 0, 3, 0, 8},
        {0, 21, 10, 0, 0, 12, 0, 20, 16, 0, 19, 0, 0, 0, 0, 15, 14, 4, 2, 18, 23, 25, 11, 7, 0} };
#else
#error "No sample puzzle is provided for this BOXWIDTH"
#endif

    int* d_a;      //Table
    int* d_result; //Table change indicator
    cudaMalloc((void**)&d_a, N*N * sizeof(int));
    cudaMalloc((void**)&d_result, sizeof(int));

    //Copy Sudoku over
    cudaMemcpy(d_a, grid, N*N * sizeof(int), cudaMemcpyHostToDevice);
    //Start the change indicator at "nothing changed" so the host never reads
    //an uninitialised value when the first pass fills no square
    int noChange = -1;
    cudaMemcpy(d_result, &noChange, sizeof(int), cudaMemcpyHostToDevice);

    SolveSudoku(grid, d_a, d_result);

    //Copy Sudoku back
    cudaMemcpy(grid, d_a, N*N * sizeof(int), cudaMemcpyDeviceToHost);
    printGrid(grid);
    cudaFree(d_a);
    cudaFree(d_result);
    return 0;
}
|
Single Pass Sudoku Solver
/*
 * superSolve
 * Solves the puzzle inside a single kernel launch: the loop that repeats the
 * deduction passes runs on the device and stops once a pass changed nothing.
 * d_a: the N x N puzzle, indexed as threadIdx.y * N + threadIdx.x; filled
 *      squares are written back as soon as they are found
 * NOTE(review): the indexing uses only threadIdx, so this is presumably
 * launched as one block of N x N threads - the launch is not shown here.
 * NOTE: `row` is threadIdx.x and `col` is threadIdx.y, so with
 * gridIdx = col * N + row the name `row` denotes the position along a line
 * of the grid; the naming is used consistently throughout the kernel.
 */
__global__ void superSolve(int * d_a) {
    //Used to remember which row | col | box ( section ) have which values
    //first index: the section, second index: value - 1
    __shared__ bool rowHas[N][N];
    __shared__ bool colHas[N][N];
    __shared__ bool boxHas[N][N];
    //Used to ensure that the table has changed
    //added: index of a square filled most recently, past: value of added
    //at the start of the current pass
    __shared__ int added, past;
    //Number of spaces which can place the number in each section
    __shared__ int rowCount[N][N];
    __shared__ int colCount[N][N];
    __shared__ int boxCount[N][N];
    //Where the square is located in the Sudoku
    int row = threadIdx.x;
    int col = threadIdx.y;
    int box = row / BOXWIDTH + (col / BOXWIDTH) * BOXWIDTH;
    //Unique identifier for each square in row, col, box
    //Corresponds to the generic Sudoku Solve
    //Using a Sudoku to solve a Sudoku !!!
    int offset = col + (row % BOXWIDTH) * BOXWIDTH + (box % BOXWIDTH);
    //Square's location in the Sudoku
    int gridIdx = col * N + row;
    //value of this square, UNASSIGNED while it is still empty
    int at = d_a[gridIdx];
    if (!gridIdx) { //Thread at 0,0 sets values
        //different values, so that the loop below is entered
        added = -1;
        past = -2;
    }
    //each thread clears one entry of every table
    rowHas[col][row] = false;
    colHas[col][row] = false;
    boxHas[col][row] = false;
    __syncthreads();
    //record the values that are given by the puzzle
    if (at != UNASSIGNED) {
        rowHas[row][at - 1] = true;
        colHas[col][at - 1] = true;
        boxHas[box][at - 1] = true;
    }
    //Repeat until a pass has not changed any values
    while (added != past) {
        //RESET counters
        rowCount[col][row] = 0;
        colCount[col][row] = 0;
        boxCount[col][row] = 0;
        __syncthreads();
        if (!gridIdx) //forget previous change
            past = added;
        int count = 0; //number of values which can fit in this square
        int guess = at; //last value found which can fit in this square
        for (int idx = 0; idx < N; ++idx) {
            //Ensures that every square in each section is working on a different number in the section
            int num = (idx + offset) % N;
            //num + 1 is a candidate when no section of this square holds it yet
            if (at == UNASSIGNED && !(rowHas[row][num] || colHas[col][num] || boxHas[box][num])) {
                count++;
                guess = num + 1;
                rowCount[row][num] ++;
                colCount[col][num] ++;
                boxCount[box][num] ++;
            }
            __syncthreads();
        }
        //Only ONE value can fit in this spot
        if (count == 1) {
            //at keeps the value, guess becomes the matching table index (value - 1)
            at = guess--;
            d_a[gridIdx] = at;
            rowHas[row][guess] = true;
            colHas[col][guess] = true;
            boxHas[box][guess] = true;
            added = gridIdx;
        }
        __syncthreads();
        if (at == UNASSIGNED) {
            //Find values which can go in only one spot in the section
            //NOTE(review): the loop continues after a match, and other
            //threads update the Has tables while it runs - confirm that a
            //square cannot accept more than one value in the same pass
            for (int idx = 0; idx < N; ++idx) {
                if (!(rowHas[row][idx] || colHas[col][idx] || boxHas[box][idx]) && (boxCount[box][idx] == 1 || rowCount[row][idx] == 1 || colCount[col][idx] == 1)) {
                    //In this section this value can only appear in this square
                    at = idx + 1;
                    d_a[gridIdx] = at;
                    rowHas[row][idx] = true;
                    colHas[col][idx] = true;
                    boxHas[box][idx] = true;
                    added = gridIdx;
                }
            }
        }
        __syncthreads();
    }
}
Assignment 3
Optimized kernel : Single Pass |
---|
/*
 * superSolve (optimized single pass)
 * Solves the puzzle inside a single kernel launch. Each thread owns one
 * square and repeats two deductions until a pass fills no square:
 *   1. the square has exactly one candidate value
 *   2. one of its candidates has no other place left in its row, column or box
 * d_a: the N x N puzzle, indexed as threadIdx.y * N + threadIdx.x; every
 *      thread writes its square back once, after the loop has finished
 * NOTE(review): the indexing uses only threadIdx, so this is presumably
 * launched as one block of N x N threads - the launch is not shown here.
 */
__global__ void superSolve(int * d_a) {
    //Used to remember which row | col | box ( section ) have which values
    //first index: the section, second index: value - 1
    __shared__ bool rowHas[N][N];
    __shared__ bool colHas[N][N];
    __shared__ bool boxHas[N][N];
    //Used to ensure that the table has changed
    //added: index of a square filled most recently, past: value of added
    //at the start of the current pass
    __shared__ int added, past;
    //Number of spaces which can place the number in each section
    __shared__ int rowCount[N][N];
    __shared__ int colCount[N][N];
    __shared__ int boxCount[N][N];

    //Thread-private record of the values not yet ruled out for this square
    bool notSeen[N];

    //Where the square is located in the Sudoku
    int row = threadIdx.x;
    int col = threadIdx.y;
    int box = row / BOXWIDTH + (col / BOXWIDTH) * BOXWIDTH;

    //Unique identifier for each square in row, col, box
    //Corresponds to the generic Sudoku Solve
    //Using a Sudoku to solve a Sudoku !!!
    int offset = col + (row % BOXWIDTH) * BOXWIDTH + (box % BOXWIDTH);

    //Square's location in the Sudoku
    int gridIdx = col * N + row;
    //value of this square, UNASSIGNED while it is still empty
    int at = d_a[gridIdx];

    if (gridIdx == 0) //Thread at 0,0 sets values
        added = -1;

    //each thread clears one entry of every table
    rowHas[row][col] = false;
    colHas[row][col] = false;
    boxHas[row][col] = false;
    for (int i = 0; i < N; i++)
        notSeen[i] = true;
    __syncthreads();

    //record the values that are given by the puzzle
    if (at != UNASSIGNED) {
        rowHas[row][at - 1] = true;
        colHas[col][at - 1] = true;
        boxHas[box][at - 1] = true;
    }

    do {
        //RESET counters
        rowCount[row][col] = 0;
        colCount[row][col] = 0;
        boxCount[row][col] = 0;
        __syncthreads();

        if (gridIdx == 0) //forget previous change
            past = added;

        int count = 0; //number of values which can fit in this square
        int guess = 0; //index (value - 1) of the last value found which can fit in this square
        for (int idx = 0; idx < N; ++idx) {
            //Ensures that every square in each section is working on a different number in the section
            int num = (idx + offset) % N;
            if (at == UNASSIGNED && notSeen[num]) {
                if (rowHas[row][num] || boxHas[box][num] || colHas[col][num]) {
                    //taken by a section of this square: never look at it again
                    notSeen[num] = false;
                } else {
                    count++;
                    guess = num;
                    rowCount[row][num] ++;
                    colCount[col][num] ++;
                    boxCount[box][num] ++;
                }
            }
            __syncthreads();
        }

        //Find values which can go in only one spot in the section
        //Every candidate of this square was counted by this thread itself, so
        //each count must be compared with 1: a bare non-zero test would accept
        //any candidate
        for (int idx = 0; idx < N && count > 1; ++idx) {
            if (notSeen[idx] && (
                rowCount[row][idx] == 1 ||
                boxCount[box][idx] == 1 ||
                colCount[col][idx] == 1)) {
                count = 1;
                guess = idx;
            }
        }

        //One value Must go here
        if (count == 1) {
            at = guess + 1;
            rowHas[row][guess] = true;
            colHas[col][guess] = true;
            boxHas[box][guess] = true;
            added = gridIdx;
        }
        __syncthreads();
    //Repeat until a pass has not changed any values
    } while (added != past);

    //single write of the result to the grid
    d_a[gridIdx] = at;
}
|
Reduced superSolve runtime from 5.1 ms to 1.2 ms
Changes:
Used faster memory
- each thread now remembers which values it has seen in a boolean array
Reduced Thread Divergence/CGMA
- values are only assigned to the grid after the kernel 'solves' the sudoku, removing wait times for assigning to global memory
- the `at` value in the kernel and rowHas, colHas, boxHas are updated in a single place, reducing wait time for updating
Coalesced Memory
- changed the indexing used to modify the _Has and _Count arrays from col->row to row->col, as row (threadIdx.x) is our fastest moving dimension
- query in order of row->box->col for the same reason
Clarified Code
- use gridIdx == 0 rather than !gridIdx
- use a do-while loop rather than a while loop