112
edits
Changes
TriForce
,→Kernel Optimization Attempts
{{GPU610/DPS915 Index | 20191}}
= randomName'); DROP TABLE projects;-- TriForce =
== Team Members ==
# [mailto:dpferri@myseneca.ca?subject=gpu610 David Ferri], Some responsibility Sudoku Solver # [mailto:vterpstra@myseneca.ca?subject=gpu610 Vincent Terpstra], Some other responsibilityJulia Sets# [mailto:rkiguru@myseneca.ca?subject=gpu610 Raymond Kiguru], Responsibility++EasyBMP
[mailto:dpferri@myseneca.ca;vterpstra@myseneca.ca;rkiguru@myseneca.ca?subject=gpu610 Email All]
Sudoku Solver Profiling
Rather than try to continuously increase the difficulty of a 9x9 sudoku, I decided to modify the program I found to handle larger and large sudokus, increasing the size of the matrices that make up the sudoku (starting with a 9x9 sudoku, which is 9 3x3 matrices, then 16x16 which is 16 4x4 matrices, and finally 25x25 which is 25 5x5 matrices) without changing the logic of the program (only constants), so larger sudokus are solved the same way as a normal one.
Source code from: https://www.geeksforgeeks.org/sudoku-backtracking-7/
{| class="wikitable mw-collapsible mw-collapsed"
! Original Code:
|-
|
// A Backtracking program in C++ to solve Sudoku problem
/* Check if 'num' is not already placed in current row,
current column and current 3x3 box */
return !UsedInRow(grid, row, num) && !UsedInCol(grid, col, num) && !UsedInBox(grid, row - row%3 , col - col%3, num)&& grid[row][col]==UNASSIGNED;
}
int grid[N][N] = {{0, 8, 0, 0, 0, 0, 0, 3, 0, 0, 0, 10, 9, 7, 11, 0},
{0, 9, 15, 13, 0, 10, 0, 0, 2, 6, 8, 16, 0, 0, 0, 0},
{0, 0, 0, 14, 8, 0, 10, 0, 0, 11, 12, 0, 0, 0, 0, 0},
{0, 7, 13, 0, 0, 0, 0, 12, 0, 8, 9, 0, 0, 0, 3, 0},
int grid[N][N] = {{1, 0, 4, 0, 25, 0, 19, 0, 0, 10, 21, 8, 0, 14, 0, 6, 12, 9, 0, 0, 0, 0, 0, 0, 5},
{5, 0, 19, 23, 24, 0, 22, 12, 0, 0, 16, 6, 0, 20, 0, 18, 0, 25, 14, 13, 10, 11, 0, 1, 15},
{4, 0, 14, 18, 7, 9, 0, 22, 21, 19, 0, 0, 0, 2, 0, 5, 0, 0, 0, 6, 16, 15, 0, 11, 12},
{22, 0, 24, 0, 23, 0, 0, 11, 0, 7, 0, 0, 4, 0, 14, 0, 2, 12, 0, 8, 5, 19, 0, 25, 9},
{6, 15, 0, 19, 4, 13, 0, 0, 5, 0, 18, 11, 0, 0, 9, 8, 22, 16, 25, 10, 7, 0, 0, 0, 0},
{0, 0, 0, 2, 0, 0, 10, 19, 3, 0, 1, 0, 22, 9, 4, 11, 15, 0, 20, 0, 0, 8, 23, 0, 25},
{0, 4, 0, 17, 0, 3, 0, 24, 0, 8, 20, 23, 11, 10, 25, 22, 0, 0, 0, 12, 13, 2, 18, 6, 0},
{0, 0, 7, 16, 0, 0, 6, 17, 2, 21, 0, 18, 0, 0, 0, 19, 0, 0, 8, 0, 0, 0, 0, 4, 0},
{18, 9, 25, 1, 2, 11, 0, 0, 13, 22, 4, 0, 21, 0, 5, 0, 23, 7, 0, 0, 15, 0, 3, 0, 8},
{0, 21, 10, 0, 0, 12, 0, 20, 16, 0, 19, 0, 0, 0, 0, 15, 14, 4, 2, 18, 23, 25, 11, 7, 0}};
if (SolveSudoku(grid) == true)
printGrid(grid);
else
printf("No solution exists");
return 0;
}
|}
Obtaining flat profiles and call graphs on matrix environment:
$ g++ sudokuC.cpp -std=c++0x -o Sudoku
$ ./Sudoku
3 1 6 5 7 8 4 9 2
5 2 9 1 3 4 7 6 8
4 8 7 6 2 9 5 3 1
2 6 3 4 1 5 9 8 7
9 7 4 8 6 3 1 2 5
8 5 1 7 9 2 6 4 3
1 3 8 9 4 7 2 5 6
6 9 2 3 5 1 8 7 4
7 4 5 2 8 6 3 1 9
$ gprof -p -b ./Sudoku gmon.out > 9x9.flt
$ gprof -q -b ./Sudoku gmon.out > 9x9.clg
$ g++ sudokuC16.cpp -std=c++0x -pg -o Sudoku16
$ ./Sudoku16
12 8 6 516 1 2 31314 410 9 71115
11 915131210 7 5 2 6 816 414 1 3
4 316 71514 813 91211 1 6 5 210
114 210 911 4 615 3 5 7 8131216
16 6 415 5 213 7 1 910 811 31412
511 9 2 4 312151416 613 7 110 8
1012 3 8 1 61114 4 5 7 216 91513
13 1 714 8 91016 3111215 2 4 5 6
2 71316 6 4 51211 8 9141015 3 1
6 411 914 7 3 210 1151213 816 5
141510 11113 9 8 5 416 312 2 6 7
3 5 812101516 1 7 213 61411 4 9
7 2 5 313121411 610 1 91516 8 4
91014 6 7 8 1 41615 2 5 3121311
813 1 4 216151012 7 311 5 6 914
15161211 3 5 6 9 81314 4 110 7 2
$ gprof -p -b ./Sudoku16 gmon.out > 16x16.flt
$ gprof -q -b ./Sudoku16 gmon.out > 16x16.clg
$ g++ sudokuC25.cpp -std=c++0x -pg -o Sudoku25
$ ./Sudoku25
111 42025241915171021 8181422 612 9 316 2 71323 5
5 2192324 82212 9 316 6 7201718212514131011 4 115
1714 9 6 32521 5 7201110 2 113 4 82423151812162219
16 721 818 4 2131123 51915241210201722 1 9 62514 3
101315122214 118 61623 925 4 3 7 51911 2 824202117
12 11110 6 513232415 716 817212519 3 4 92214 22018
8191321 916 42512 215 3 511201417231822 110 724 6
4171418 7 9 322211925 124 223 5132010 61615 81112
22 3241523182011 1 71013 4 61416 21221 8 5191725 9
2016 225 510 8 61417 922121819 11115 724 3232113 4
1325 3 510 22314 418221517192420 7 1 9211216 6 811
1423 124121916 815 6 2 7202510 3 413171121 9 51822
7 818111720242122 9 3 4 11216 2 61419 52513151023
22216 9211711 71025 8 51413 6122418152319 4 1 320
6152019 41312 3 5 118112321 9 822162510 71724 214
211812 216 71019 313 12422 9 41115 6201417 823 525
924 813 1 625 420121714 3 718231622 51911211015 2
231022 71521 5 91814 62016 81117 1 21325 4 3191224
25 5 61411 117 2 8241321192315 9 31012 420182216 7
32017 4192215162311122510 5 22118 824 7 6 114 913
19 62322 81518 125 414 2 9 3 7131011162024 5121721
15 4 51714 3 72419 8202311102522 921 11213 218 616
1112 7162023 617 22124181315 11925 5 8 31422 9 410
18 925 1 21114101322 4122116 52423 7 6171520 319 8
242110 31312 92016 51917 622 81514 4 218232511 7 1
$ gprof -p -b ./Sudoku25 gmon.out > 25x25.flt
$ gprof -q -b ./Sudoku25 gmon.out > 25x25.clg
0.00 0.00 0.00 1 0.00 0.00 SolveSudoku(int (*) [9])
0.00 0.00 0.00 1 0.00 0.00 printGrid(int (*) [9])
Call graph
granularity: each sample hit covers 2 byte(s) no time propagated
index % time self children called name
0.00 0.00 6732/6732 SolveSudoku(int (*) [9]) [13]
[8] 0.0 0.00 0.00 6732 isSafe(int (*) [9], int, int, int) [8]
0.00 0.00 6732/6732 UsedInRow(int (*) [9], int, int) [9]
0.00 0.00 2185/2185 UsedInCol(int (*) [9], int, int) [10]
0.00 0.00 1078/1078 UsedInBox(int (*) [9], int, int, int) [11]
-----------------------------------------------
0.00 0.00 6732/6732 isSafe(int (*) [9], int, int, int) [8]
[9] 0.0 0.00 0.00 6732 UsedInRow(int (*) [9], int, int) [9]
-----------------------------------------------
0.00 0.00 2185/2185 isSafe(int (*) [9], int, int, int) [8]
[10] 0.0 0.00 0.00 2185 UsedInCol(int (*) [9], int, int) [10]
-----------------------------------------------
0.00 0.00 1078/1078 isSafe(int (*) [9], int, int, int) [8]
[11] 0.0 0.00 0.00 1078 UsedInBox(int (*) [9], int, int, int) [11]
-----------------------------------------------
0.00 0.00 770/770 SolveSudoku(int (*) [9]) [13]
[12] 0.0 0.00 0.00 770 FindUnassignedLocation(int (*) [9], int&, int&) [12]
-----------------------------------------------
769 SolveSudoku(int (*) [9]) [13]
0.00 0.00 1/1 main [6]
[13] 0.0 0.00 0.00 1+769 SolveSudoku(int (*) [9]) [13]
0.00 0.00 6732/6732 isSafe(int (*) [9], int, int, int) [8]
0.00 0.00 770/770 FindUnassignedLocation(int (*) [9], int&, int&) [12]
769 SolveSudoku(int (*) [9]) [13]
-----------------------------------------------
0.00 0.00 1/1 main [6]
[14] 0.0 0.00 0.00 1 printGrid(int (*) [9]) [14]
-----------------------------------------------
Index by function name
[13] SolveSudoku(int (*) [9]) [11] UsedInBox(int (*) [9], int, int, int) [14] printGrid(int (*) [9])
[12] FindUnassignedLocation(int (*) [9], int&, int&) [10] UsedInCol(int (*) [9], int, int)
[8] isSafe(int (*) [9], int, int, int) [9] UsedInRow(int (*) [9], int, int)
For 16x16 Sudoku Puzzle (4x4 squares)
0.01 38.40 0.01 frame_dummy
0.00 38.40 0.00 1 0.00 0.00 printGrid(int (*) [16])
Call graph
granularity: each sample hit covers 2 byte(s) for 0.03% of 36.85 seconds
index % time self children called name
<spontaneous>
[1] 100.0 0.00 36.85 main [1]
1.93 34.93 1/1 SolveSudoku(int (*) [16]) [2]
0.00 0.00 1/1 printGrid(int (*) [16]) [14]
-----------------------------------------------
28071635 SolveSudoku(int (*) [16]) [2]
1.93 34.93 1/1 main [1]
[2] 100.0 1.93 34.93 1+28071635 SolveSudoku(int (*) [16]) [2]
1.69 19.09 449145092/449145092 isSafe(int (*) [16], int, int, int) [3]
14.14 0.00 28071636/28071636 FindUnassignedLocation(int (*) [16], int&, int&) [4]
28071635 SolveSudoku(int (*) [16]) [2]
-----------------------------------------------
1.69 19.09 449145092/449145092 SolveSudoku(int (*) [16]) [2]
[3] 56.4 1.69 19.09 449145092 isSafe(int (*) [16], int, int, int) [3]
13.58 0.00 449145092/449145092 UsedInRow(int (*) [16], int, int) [5]
3.54 0.00 120354547/120354547 UsedInCol(int (*) [16], int, int) [6]
1.98 0.00 41212484/41212484 UsedInBox(int (*) [16], int, int, int) [7]
-----------------------------------------------
14.14 0.00 28071636/28071636 SolveSudoku(int (*) [16]) [2]
[4] 38.4 14.14 0.00 28071636 FindUnassignedLocation(int (*) [16], int&, int&) [4]
-----------------------------------------------
13.58 0.00 449145092/449145092 isSafe(int (*) [16], int, int, int) [3]
[5] 36.8 13.58 0.00 449145092 UsedInRow(int (*) [16], int, int) [5]
-----------------------------------------------
3.54 0.00 120354547/120354547 isSafe(int (*) [16], int, int, int) [3]
[6] 9.6 3.54 0.00 120354547 UsedInCol(int (*) [16], int, int) [6]
-----------------------------------------------
1.98 0.00 41212484/41212484 isSafe(int (*) [16], int, int, int) [3]
[7] 5.4 1.98 0.00 41212484 UsedInBox(int (*) [16], int, int, int) [7]
-----------------------------------------------
0.00 0.00 1/1 main [1]
[14] 0.0 0.00 0.00 1 printGrid(int (*) [16]) [14]
-----------------------------------------------
Index by function name
[2] SolveSudoku(int (*) [16]) [7] UsedInBox(int (*) [16], int, int, int) [14] printGrid(int (*) [16])
[4] FindUnassignedLocation(int (*) [16], int&, int&) [6] UsedInCol(int (*) [16], int, int)
[3] isSafe(int (*) [16], int, int, int) [5] UsedInRow(int (*) [16], int, int)
For 25x25 Sudoku Puzzle (5x5 squares)
0.00 2167.56 0.00 1 0.00 0.00 printGrid(int (*) [25])
Call graph granularity: each sample hit covers 2 byte(s) for 0.00% of 2085.44 seconds index % time self children called name <spontaneous> [1] 100.0 0.00 2085.30 main [1] 97.03 1988.27 1/1 SolveSudoku(int (*) [25]) [2] 0.00 0.00 1/1 printGrid(int (*) [25]) [14] ----------------------------------------------- 876012757 SolveSudoku(int (*) [25]) [2] 97.03 1988.27 1/1 main [1] [2] 100.0 97.03 1988.27 1+876012757 SolveSudoku(int (*) [25]) [2] 101.19 1361.55 425478951/425478951 isSafe(int (*) [25], int, int, int) [3] 525.53 0.00 876012758/876012758 FindUnassignedLocation(int (*) [25], int&, int&) [5] 876012757 SolveSudoku(int (*) [25]) [2] ----------------------------------------------- 101.19 1361.55 425478951/425478951 SolveSudoku(int (*) [25]) [2] [3] 70.1 101.19 1361.55 425478951 isSafe(int (*) [25], int, int, int) [3] 1011.03 0.00 425478951/425478951 UsedInRow(int (*) [25], int, int) [4] 259.56 0.00 590817023/590817023 UsedInCol(int (*) [25], int, int) [6] 90.96 0.00 1355081265/1355081265 UsedInBox(int (*) [25], int, int, int) [7] ----------------------------------------------- 1011.03 0.00 425478951/425478951 isSafe(int (*) [25], int, int, int) [3] [4] 48.5 1011.03 0.00 425478951 UsedInRow(int (*) [25], int, int) [4] ----------------------------------------------- 525.53 0.00 876012758/876012758 SolveSudoku(int (*) [25]) [2] [5] 25.2 525.53 0.00 876012758 FindUnassignedLocation(int (*) [25], int&, int&) [5] ----------------------------------------------- 259.56 0.00 590817023/590817023 isSafe(int (*) [25], int, int, int) [3] [6] 12.4 259.56 0.00 590817023 UsedInCol(int (*) [25], int, int) [6] ----------------------------------------------- 90.96 0.00 1355081265/1355081265 isSafe(int (*) [25], int, int, int) [3] [7] 4.4 90.96 0.00 1355081265 UsedInBox(int (*) [25], int, int, int) [7] ----------------------------------------------- <spontaneous> [8] 0.0 0.14 0.00 frame_dummy [8] ----------------------------------------------- 0.00 0.00 1/1 main [1] [14] 0.0 0.00 0.00 1 printGrid(int (*) [25]) [14] ----------------------------------------------- Index by function name [2] SolveSudoku(int (*) [25]) [7] UsedInBox(int (*) [25], int, int, int) [14] printGrid(int (*) [25]) [5] FindUnassignedLocation(int (*) [25], int&, int&) [6] UsedInCol(int (*) [25], int, int) [8] frame_dummy [3] isSafe(int (*) [25], int, int, int) [4] UsedInRow(int (*) [25], int, int) === Assignment 2 1: EasyBMP ===
EasyBMP Bitmap image library
Library: http://easybmp.sourceforge.net/
{| class="wikitable mw-collapsible mw-collapsed"! Sample code:|-| /**/ #include "EasyBMP.h" using namespace std; int main(int argc, char* argv[]){ // Create a new Bitmap image with EasyBMP BMP Background; Background.ReadFromFile(argv[1]); BMP Output; int picWidth = Background.TellWidth(); int picHeight = Background.TellHeight(); Output.SetSize(Background.TellWidth(), Background.TellHeight()); Output.SetBitDepth(1); for (int i = 1; i < picWidth - 1; ++i) { for (int j = 1; j < picHeight - 1; ++j) { int col = (Background(i, j)->Blue + Background(i, j)->Green + 10 * Background(i, j)->Red) / 12; if (col > 127) { Output(i, j)->Red = 255; Output(i, j)->Blue = 255; Output(i, j)->Green = 255; } else {
Output(i, j)->Red = 0;
Output(i, j)->Blue = 0;
Output(i, j)->Green = 0;
}
} } /* Solves the Sudoku, with best values */ void SolveSudoku(int grid[N][N], int* d_a, int* d_results) { dim3 block(BOXWIDTH, BOXWIDTH); int lastIdx(-1), nextIdx(-1); do { lastIdx = nextIdx; educatedGuess << <block, block >> > (d_a, d_results); cudaMemcpy(&nextIdx, d_results, sizeof(int), cudaMemcpyDeviceToHost); } while (lastIdx != nextIdx); } /* A utility function to print grid */ void printGrid(int grid[N][N]) { for (int row = 0; row < N; row++) { Output.WriteToFilefor (int col = 0; col < N; col++) printf("%3d", grid[row][col]); printf("\n"); } } /* Driver Program to test above functions */ int main(argv) { /* 0 means unassigned cells * int grid[N][N] = { {3, 0, 6, 5, 0, 8, 4, 0, 0}, {5, 2, 0, 0, 0, 0, 0, 0, 0}, {0, 8, 7, 0, 0, 0, 0, 3, 1}, {0, 0, 3, 0, 1, 0, 0, 8, 0}, {9, 0, 0, 8, 6, 3, 0, 0, 5}, {0, 5, 0, 0, 9, 0, 6, 0, 0}, {1, 3, 0, 0, 0, 0, 2, 5, 0}, {0, 0, 0, 0, 0, 0, 0, 7, 4}, {0, 0, 5, 2, 0, 6, 3, 0, 0} }; /** int grid[N][N]= {{0, 8, 0, 0, 0, 0, 0, 3, 0, 0, 0, 10, 9, 7, 11, 0}, {0, 9, 15, 13, 0, 10, 0, 0, 2, 6, 8, 16, 0, 0, 0, 0}, {0, 0, 16, 0, 15, 0, 8, 0, 9, 0, 0, 0, 6, 0, 2, 0}, {1, 0, 2, 0, 9, 11, 4, 6, 15, 3, 5, 7, 0, 0, 12, 0}, {16, 6, 4, 0, 5, 2, 0, 0, 1, 0, 0, 0, 11, 0, 0, 12}, {5, 11, 0, 0, 0, 3, 0, 15, 0, 16, 0, 13, 0, 1, 0, 8}, {0, 0, 3, 0, 0, 6, 11, 14, 0, 5, 7, 0, 0, 9, 0, 0}, {0, 0, 0, 14, 8, 0, 10, 0, 0, 11, 12, 0, 0, 0, 0, 0}, {0, 7, 13, 0, 0, 0, 0, 12, 0, 8, 9, 0, 0, 0, 3, 0}, {0, 0, 11, 9, 0, 7, 0, 0, 0, 0, 0, 12, 0, 8, 16, 5}, {0, 0, 10, 0, 11, 13, 0, 0, 0, 0, 0, 3, 12, 0, 6, 0}, {0, 5, 0, 0, 10, 15, 0, 1, 7, 2, 0, 0, 14, 11, 0, 0}, {0, 0, 5, 0, 0, 12, 14, 0, 0, 10, 0, 0, 15, 0, 0, 4}, {9, 0, 14, 6, 0, 0, 1, 0, 16, 0, 2, 0, 3, 0, 13, 0}, {8, 13, 0, 4, 0, 0, 0, 0, 12, 7, 3, 0, 0, 6, 0, 0}, {0, 16, 12, 0, 0, 5, 0, 9, 0, 13, 14, 4, 1, 0, 0, 0} }; /**/ int grid[N][N] = { {1, 0, 4, 0, 25, 0, 19, 0, 0, 10, 21, 8, 0, 14, 0, 6, 12, 9, 0, 0, 0, 0, 0, 0, 5},{5, 0, 19, 23, 24, 0, 22, 12, 0, 0, 16, 6, 0, 20, 0, 18, 0, 25, 14, 13, 10, 11, 0, 1, 15},{0, 0, 0, 0, 0, 0, 21, 5, 0, 20, 11, 10, 0, 1, 0, 4, 8, 24, 23, 15, 18, 0, 16, 22, 19}, {0, 7, 21, 8, 18, 0, 0, 0, 11, 0, 5, 0, 0, 24, 0, 0, 0, 17, 22, 1, 9, 6, 25, 0, 0}, {0, 13, 15, 0, 22, 14, 0, 18, 0, 16, 0, 0, 0, 4, 0, 0, 0, 19, 0, 0, 0, 24, 20, 21, 17}, {12, 0, 11, 0, 6, 0, 0, 0, 0, 15, 0, 0, 0, 0, 21, 25, 19, 0, 4, 0, 22, 14, 0, 20, 0}, {8, 0, 0, 21, 0, 16, 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 17, 23, 18, 22, 0, 0, 0, 24, 6}, {4, 0, 14, 18, 7, 9, 0, 22, 21, 19, 0, 0, 0, 2, 0, 5, 0, 0, 0, 6, 16, 15, 0, 11, 12}, {22, 0, 24, 0, 23, 0, 0, 11, 0, 7, 0, 0, 4, 0, 14, 0, 2, 12, 0, 8, 5, 19, 0, 25, 9}, {20, 0, 0, 0, 5, 0, 0, 0, 0, 17, 9, 0, 12, 18, 0, 1, 0, 0, 7, 24, 0, 0, 0, 13, 4}, {13, 0, 0, 5, 0, 2, 23, 14, 4, 18, 22, 0, 17, 0, 0, 20, 0, 1, 9, 21, 12, 0, 0, 8, 11}, {14, 23, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 20, 25, 0, 3, 4, 13, 0, 11, 21, 9, 5, 18, 22}, {7, 0, 0, 11, 17, 20, 24, 0, 0, 0, 3, 4, 1, 12, 0, 0, 6, 14, 0, 5, 25, 13, 0, 0, 0}, {0, 0, 16, 9, 0, 17, 11, 7, 10, 25, 0, 0, 0, 13, 6, 0, 0, 18, 0, 0, 19, 4, 0, 0, 20}, {6, 15, 0, 19, 4, 13, 0, 0, 5, 0, 18, 11, 0, 0, 9, 8, 22, 16, 25, 10, 7, 0, 0, 0, 0}, {0, 0, 0, 2, 0, 0, 10, 19, 3, 0, 1, 0, 22, 9, 4, 11, 15, 0, 20, 0, 0, 8, 23, 0, 25}, {0, 24, 8, 13, 1, 0, 0, 4, 20, 0, 17, 14, 0, 0, 18, 0, 16, 22, 5, 0, 11, 0, 10, 0, 0}, {23, 10, 0, 0, 0, 0, 0, 0, 18, 0, 6, 0, 16, 0, 0, 17, 1, 0, 13, 0, 0, 3, 19, 12, 0}, {25, 5, 0, 14, 11, 0, 17, 0, 8, 24, 13, 0, 19, 23, 15, 9, 0, 0, 12, 0, 20, 0, 22, 0, 7}, {0, 0, 17, 4, 0, 22, 15, 0, 23, 11, 12, 25, 0, 0, 0, 0, 18, 8, 0, 7, 0, 0, 14, 0, 13}, {19, 6, 23, 22, 8, 0, 0, 1, 25, 4, 14, 2, 0, 3, 7, 13, 10, 11, 16, 0, 0, 0, 0, 0, 0}, {0, 4, 0, 17, 0, 3, 0, 24, 0, 8, 20, 23, 11, 10, 25, 22, 0, 0, 0, 12, 13, 2, 18, 6, 0}, {0, 0, 7, 16, 0, 0, 6, 17, 2, 21, 0, 18, 0, 0, 0, 19, 0, 0, 8, 0, 0, 0, 0, 4, 0}, {18, 9, 25, 1, 2, 11, 0, 0, 13, 22, 4, 0, 21, 0, 5, 0, 23, 7, 0, 0, 15, 0, 3, 0, 8}, {0, 21, 10, 0, 0, 12, 0, 20, 16, 0, 19, 0, 0, 0, 0, 15, 14, 4, 2, 18, 23, 25, 11, 7, 0} }; /**/ int* d_a; //Table int* d_result; //Table change indicator cudaMalloc((void**)&d_a, N*N * sizeof(int)); cudaMalloc((void**)&d_result, sizeof(int)); //Copy Sudoku over cudaMemcpy(d_a, grid, N*N * sizeof(int), cudaMemcpyHostToDevice); SolveSudoku(grid, d_a, d_result); //Copy Sudoku back cudaMemcpy(grid, d_a, N*N * sizeof(int), cudaMemcpyDeviceToHost); printGrid(grid); cudaFree(d_a); cudaFree(d_result); return 0;
}
__global__ void superSolve(int * d_a) { //Used to remember which row | col | box ( section ) have which values __shared__ bool rowHas[N][N]; __shared__ bool colHas[N][N]; __shared__ bool boxHas[N][N]; //Used to ensure that the table has changed __shared__ int added, past; //Number of spaces which can place the number in each section __shared__ int rowCount[N][N]; __shared__ int colCount[N][N]; __shared__ int boxCount[N][N]; //Where the square is located in the Sudoku int row = threadIdx.x; int col = threadIdx.y; int box = row / BOXWIDTH + (col / BOXWIDTH) * BOXWIDTH; //Unique identifier for each square in row, col, box //Corresponds to the generic Sudoku Solve //Using a Sudoku to solve a Sudoku !!! int offset = col + (row % BOXWIDTH) * BOXWIDTH + (box % BOXWIDTH); //Square's location in the Sudoku int gridIdx = col * N + row; int at = d_a[gridIdx]; if (!gridIdx) { //Thread at 0,0 sets values added = -1; past = -2; } rowHas[col][row] = false; colHas[col][row] = false; boxHas[col][row] = false; __syncthreads(); if (at != UNASSIGNED) { rowHas[row][at - 1] = true; colHas[col][at - 1] = true; boxHas[box][at - 1] = true; } //Previous loop has not changed any values while (added != past) { //RESET counters rowCount[col][row] = 0; colCount[col][row] = 0; boxCount[col][row] = 0; __syncthreads(); if (!gridIdx) //forget previous change past = added; int count = 0; //number of values which can fit in this square int guess = at; //last value found which can fit in this square for (int idx = 0; idx < N; ++idx) { //Ensures that every square in each section is working on a different number in the section int num = (idx + offset) % N; if (at == UNASSIGNED && !(rowHas[row][num] || colHas[col][num] || boxHas[box][num])) { count++; guess = num + 1; rowCount[row][num] ++; colCount[col][num] ++; boxCount[box][num] ++; } __syncthreads(); } //Only ONE value can fit in this spot if (count == 1) { at = guess--; d_a[gridIdx] = at; rowHas[row][guess] = true; colHas[col][guess] = true; boxHas[box][guess] = true; added = gridIdx; } __syncthreads(); if (at == UNASSIGNED) { //Find values which can go in only one spot in the section for (int idx = 0; idx < N; ++idx) { if (!(rowHas[row][idx] || colHas[col][idx] || boxHas[box][idx]) && (boxCount[box][idx] == 1 || rowCount[row][idx] == 1 || colCount[col][idx] == 1)) { //In progressthis section this value can only appear in this square at = idx + 1; d_a[gridIdx] = at; rowHas[row][idx] = true; colHas[col][idx] = true; boxHas[box][idx] = true; added = gridIdx; } } } __syncthreads(); } }[[File:Backtrack_vs_Kernel.png]]
=== Assignment 3 ===
Changes:
Reduced Thread Divergence/CGMA
-each thread now remembers which values it has seen in a boolean array
- values are only assigned to the grid after the kernel 'solves' the sudoku
- at value in kernel and shared memory for rowHas, colHas, boxHas, updated in a single place
Coalesced Memory
- change modifying _Has and _Count arrays from row->col to col->row as row(threadIdx.x) is our fastest moving dimension
Clarified Code
- use gridIdx == 0 rather then !gridIdx
- use a do-while loop rather then a while loop
{| class="wikitable mw-collapsible mw-collapsed"
! Full code
|-
|
#include <stdio.h>
// CUDA header file
#include "cuda_runtime.h"
#include <device_launch_parameters.h>
#ifndef __CUDACC__
#define __CUDACC__
#endif
// UNASSIGNED is used for empty cells in Sudoku grid
#define UNASSIGNED 0
// BOX_W is used for the length of one of the square sub-regions of the Sudoku grid.
// Overall length will be N * N.
#define BOX_W 5
#define N (BOX_W * BOX_W)
__global__ void solve(int* d_a) {
// Used to remember which row | col | box ( section ) have which values
__shared__ bool rowHas[N][N];
__shared__ bool colHas[N][N];
__shared__ bool boxHas[N][N];
// Used to ensure that the table has changed
__shared__ bool changed;
// Number of spaces which can place the number in each section
__shared__ int rowCount[N][N];
__shared__ int colCount[N][N];
__shared__ int boxCount[N][N];
// Where the square is located in the Sudoku
int row = threadIdx.x;
int col = threadIdx.y;
int box = row / BOX_W + (col / BOX_W) * BOX_W;
// Unique identifier for each square in row, col, box
// Corresponds to the generic Sudoku Solve
// Using a Sudoku to solve a Sudoku !!!
int offset = col + (row % BOX_W) * BOX_W + (box % BOX_W);
// Square's location in the Sudoku
int gridIdx = col * N + row;
int at = d_a[gridIdx];
bool notSeen[N];
for (int i = 0; i < N; ++i)
notSeen[i] = true;
rowHas[col][row] = false;
colHas[col][row] = false;
boxHas[col][row] = false;
__syncthreads();
if (at != UNASSIGNED) {
rowHas[row][at - 1] = true;
colHas[col][at - 1] = true;
boxHas[box][at - 1] = true;
}
// Previous loop has not changed any values
do {
// RESET counters
rowCount[col][row] = 0;
colCount[col][row] = 0;
boxCount[col][row] = 0;
__syncthreads();
if (gridIdx == 0) // forget previous change
changed = false;
int count = 0; // number of values which can fit in this square
int guess = 0; // last value found which can fit in this square
for (int idx = 0; idx < N; ++idx) {
// Ensures that every square in each section is working on a different number in the section
int num = (idx + offset) % N;
if (at == UNASSIGNED && notSeen[num]) {
if (rowHas[row][num] || boxHas[box][num] || colHas[col][num])
notSeen[num] = false;
else {
++count;
guess = num;
rowCount[row][num]++;
colCount[col][num]++;
boxCount[box][num]++;
}
}
__syncthreads();
}
// Find values which can go in only one spot in the section
for (int idx = 0; idx < N && count > 1; ++idx) {
if (notSeen[idx] &&
(rowCount[row][idx] == 1 || boxCount[box][idx] == 1 || colCount[col][idx] == 1)) {
// In this section this value can only appear in this square
guess = idx;
count = 1;
}
}
if (count == 1) {
at = guess + 1;
rowHas[row][guess] = true;
colHas[col][guess] = true;
boxHas[box][guess] = true;
changed = true;
}
__syncthreads();
} while (changed);
//SOLVED CHECK
if (!(rowHas[row][col] || colHas[row][col] || boxHas[row][col]))
changed = true;
__syncthreads();
if (changed && gridIdx == 0)
at = 0;
d_a[gridIdx] = at;
}
void print(int result[N][N]) {
for (int row = 0; row < N; row++) {
for (int col = 0; col < N; col++)
printf("%3d", result[row][col]);
printf("\n");
}
}
// Driver program to test main program functions
int main() {
int h_a[N][N] = {
{ 1, 0, 4, 0, 25, 0, 19, 0, 0, 10, 21, 8, 0, 14, 0, 6, 12, 9, 0, 0, 0, 0, 0, 0, 5},
{ 5, 0, 19, 23, 24, 0, 22, 12, 0, 0, 16, 6, 0, 20, 0, 18, 0, 25, 14, 13, 10, 11, 0, 1, 15},
{ 0, 0, 0, 0, 0, 0, 21, 5, 0, 20, 11, 10, 0, 1, 0, 4, 8, 24, 23, 15, 18, 0, 16, 22, 19},
{ 0, 7, 21, 8, 18, 0, 0, 0, 11, 0, 5, 0, 0, 24, 0, 0, 0, 17, 22, 1, 9, 6, 25, 0, 0},
{ 0, 13, 15, 0, 22, 14, 0, 18, 0, 16, 0, 0, 0, 4, 0, 0, 0, 19, 0, 0, 0, 24, 20, 21, 17},
{ 12, 0, 11, 0, 6, 0, 0, 0, 0, 15, 0, 0, 0, 0, 21, 25, 19, 0, 4, 0, 22, 14, 0, 20, 0},
{ 8, 0, 0, 21, 0, 16, 0, 0, 0, 2, 0, 3, 0, 0, 0, 0, 17, 23, 18, 22, 0, 0, 0, 24, 6},
{ 4, 0, 14, 18, 7, 9, 0, 22, 21, 19, 0, 0, 0, 2, 0, 5, 0, 0, 0, 6, 16, 15, 0, 11, 12},
{ 22, 0, 24, 0, 23, 0, 0, 11, 0, 7, 0, 0, 4, 0, 14, 0, 2, 12, 0, 8, 5, 19, 0, 25, 9},
{ 20, 0, 0, 0, 5, 0, 0, 0, 0, 17, 9, 0, 12, 18, 0, 1, 0, 0, 7, 24, 0, 0, 0, 13, 4},
{ 13, 0, 0, 5, 0, 2, 23, 14, 4, 18, 22, 0, 17, 0, 0, 20, 0, 1, 9, 21, 12, 0, 0, 8, 11},
{ 14, 23, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 20, 25, 0, 3, 4, 13, 0, 11, 21, 9, 5, 18, 22},
{ 7, 0, 0, 11, 17, 20, 24, 0, 0, 0, 3, 4, 1, 12, 0, 0, 6, 14, 0, 5, 25, 13, 0, 0, 0},
{ 0, 0, 16, 9, 0, 17, 11, 7, 10, 25, 0, 0, 0, 13, 6, 0, 0, 18, 0, 0, 19, 4, 0, 0, 20},
{ 6, 15, 0, 19, 4, 13, 0, 0, 5, 0, 18, 11, 0, 0, 9, 8, 22, 16, 25, 10, 7, 0, 0, 0, 0},
{ 0, 0, 0, 2, 0, 0, 10, 19, 3, 0, 1, 0, 22, 9, 4, 11, 15, 0, 20, 0, 0, 8, 23, 0, 25},
{ 0, 24, 8, 13, 1, 0, 0, 4, 20, 0, 17, 14, 0, 0, 18, 0, 16, 22, 5, 0, 11, 0, 10, 0, 0},
{ 23, 10, 0, 0, 0, 0, 0, 0, 18, 0, 6, 0, 16, 0, 0, 17, 1, 0, 13, 0, 0, 3, 19, 12, 0},
{ 25, 5, 0, 14, 11, 0, 17, 0, 8, 24, 13, 0, 19, 23, 15, 9, 0, 0, 12, 0, 20, 0, 22, 0, 7},
{ 0, 0, 17, 4, 0, 22, 15, 0, 23, 11, 12, 25, 0, 0, 0, 0, 18, 8, 0, 7, 0, 0, 14, 0, 13},
{ 19, 6, 23, 22, 8, 0, 0, 1, 25, 4, 14, 2, 0, 3, 7, 13, 10, 11, 16, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 17, 0, 3, 0, 24, 0, 8, 20, 23, 11, 10, 25, 22, 0, 0, 0, 12, 13, 2, 18, 6, 0},
{ 0, 0, 7, 16, 0, 0, 6, 17, 2, 21, 0, 18, 0, 0, 0, 19, 0, 0, 8, 0, 0, 0, 0, 4, 0},
{ 18, 9, 25, 1, 2, 11, 0, 0, 13, 22, 4, 0, 21, 0, 5, 0, 23, 7, 0, 0, 15, 0, 3, 0, 8},
{ 0, 21, 10, 0, 0, 12, 0, 20, 16, 0, 19, 0, 0, 0, 0, 15, 14, 4, 2, 18, 23, 25, 11, 7, 0}
};
int* d_a; //Table
cudaMalloc((void**)&d_a, N * N * sizeof(int));
// Copy Sudoku to device
cudaMemcpy(d_a, h_a, N * N * sizeof(int), cudaMemcpyHostToDevice);
dim3 dBlock(N, N);
solve << <1, dBlock >> > (d_a);
// Copy Sudoku back to host
cudaMemcpy(h_a, d_a, N * N * sizeof(int), cudaMemcpyDeviceToHost);
// Check if solved
if (h_a[0][0])
print(h_a);
else
printf("No solution could be found.");
cudaFree(d_a);
return 0;
}
|}
[[File:Unoptimized_vs_Optimized.png]]
===Kernel Optimization Attempts===
These Kernels change a minor part of the Optimized Kernel or use a slightly different algorithm in an attempt to make it faster
Change : Replaces the boolean array hasSeen with a single int & uses bitwise operators
Theory : Since local array variables of threads are stored in Global memory this was an attempt to move that into a register
Result : No speed up noticed, suggesting that more is happening beyond arrays stored in Global memory, perhaps some type of paging,
more testing would be needed on something less erratic then a Sudoku Solver
{| class="wikitable mw-collapsible mw-collapsed"
! Using a int as a boolean array
|-
|
__global__ void solve(int* d_a) {
// Used to remember which row | col | box ( section ) have which values
__shared__ bool rowHas[N][N];
__shared__ bool colHas[N][N];
__shared__ bool boxHas[N][N];
// Used to ensure that the table has changed
__shared__ bool changed;
// Number of spaces which can place the number in each section
__shared__ int rowCount[N][N];
__shared__ int colCount[N][N];
__shared__ int boxCount[N][N];
// Where the square is located in the Sudoku
int row = threadIdx.x;
int col = threadIdx.y;
int box = row / BOX_W + (col / BOX_W) * BOX_W;
int gridIdx = col * N + row;
int at = d_a[gridIdx];
// Unique identifier for each square in row, col, box
// Corresponds to the generic Sudoku Solve
// Using a Sudoku to solve a Sudoku !!!
int offset = col + (row % BOX_W) * BOX_W + (box % BOX_W);
// Square's location in the Sudoku
int notSeen = 0;
rowHas[col][row] = false;
colHas[col][row] = false;
boxHas[col][row] = false;
__syncthreads();
if (at != UNASSIGNED) {
rowHas[row][at - 1] = true;
colHas[col][at - 1] = true;
boxHas[box][at - 1] = true;
} else {
notSeen = ~0;
}
__syncthreads();
// Previous loop has not changed any values
do {
// RESET counters
rowCount[col][row] = 0;
colCount[col][row] = 0;
boxCount[col][row] = 0;
__syncthreads();
if (gridIdx == 0) // forget previous change
changed = false;
int count = 0; // number of values which can fit in this square
int guess = 0; // last value found which can fit in this square
int b_shuttle = 1;
for (int idx = 0; idx < N; ++idx) {
// Ensures that every square in each section is working on a different number in the section
int num = (idx + offset) % N;
if (b_shuttle & notSeen) {
if (rowHas[row][num] || boxHas[box][num] || colHas[col][num])
notSeen ^= b_shuttle;
else {
++count;
guess = num;
rowCount[row][num]++;
colCount[col][num]++;
boxCount[box][num]++;
}
}
b_shuttle <<= 1;
__syncthreads();
}
// Find values which can go in only one spot in the section
b_shuttle = 1;
for (int idx = 0; idx < N && count > 1; ++idx) {
int num = (idx + offset) % N;
if ((b_shuttle & notSeen) &&
(rowCount[row][num] == 1 || boxCount[box][num] == 1 || colCount[col][num] == 1)) {
// In this section this value can only appear in this square
guess = num;
count = 1;
}
b_shuttle <<= 1;
}
if (count == 1) {
at = guess + 1;
notSeen = 0;
rowHas[row][guess] = true;
colHas[col][guess] = true;
boxHas[box][guess] = true;
changed = true;
}
__syncthreads();
} while (changed);
//SOLVED CHECK
if (!(rowHas[row][col] || colHas[row][col] || boxHas[row][col]))
changed = true;
__syncthreads();
if (changed && gridIdx == 0)
at = 0;
d_a[gridIdx] = at;
}
|}
Change : Remove the counters, and logic which checks for a section needing a value in one place
Theory : The counting logic requires a additional nested loop each solve cycle and created more thread divergence
Result : The algorithm is slower, probably because 'sections requiring a single value' adds more values early in the kernel resulting in less passes overall
Also this kernel is similar to one of my earlier builds, which was unable to solve the 9x9 getting stuck on every square having more then one possible value
{| class="wikitable mw-collapsible mw-collapsed"
! Dropping Section Logic
|-
|
__global__ void solve(int* d_a) {
// Used to remember which row | col | box ( section ) have which values
__shared__ bool rowHas[N][N];
__shared__ bool colHas[N][N];
__shared__ bool boxHas[N][N];
// Used to ensure that the table has changed
__shared__ bool changed;
// Number of spaces which can place the number in each section
// Where the square is located in the Sudoku
int row = threadIdx.x;
int col = threadIdx.y;
int box = row / BOX_W + (col / BOX_W) * BOX_W;
// Unique identifier for each square in row, col, box
// Corresponds to the generic Sudoku Solve
// Using a Sudoku to solve a Sudoku !!!
int offset = col + (row % BOX_W) * BOX_W + (box % BOX_W);
// Square's location in the Sudoku
int gridIdx = col * N + row;
int at = d_a[gridIdx];
bool notSeen[N];
for (int i = 0; i < N; ++i)
notSeen[i] = true;
rowHas[col][row] = false;
colHas[col][row] = false;
boxHas[col][row] = false;
__syncthreads();
if (at != UNASSIGNED) {
rowHas[row][at - 1] = true;
colHas[col][at - 1] = true;
boxHas[box][at - 1] = true;
}
// Previous loop has not changed any values
do {
// RESET counters
__syncthreads();
if (gridIdx == 0) // forget previous change
changed = false;
int count = 0; // number of values which can fit in this square
int guess = 0; // last value found which can fit in this square
for (int idx = 0; idx < N; ++idx) {
// Ensures that every square in each section is working on a different number in the section
int num = (idx + offset) % N;
if (at == UNASSIGNED && notSeen[num]) {
if (rowHas[row][num] || boxHas[box][num] || colHas[col][num])
notSeen[num] = false;
else {
++count;
guess = num;
}
}
__syncthreads();
}
if (count == 1) {
at = guess + 1;
rowHas[row][guess] = true;
colHas[col][guess] = true;
boxHas[box][guess] = true;
changed = true;
}
__syncthreads();
} while (changed);
//SOLVED CHECK
if (!(rowHas[row][col] || colHas[row][col] || boxHas[row][col]))
changed = true;
__syncthreads();
if (changed && gridIdx == 0)
at = 0;
d_a[gridIdx] = at;
}
|}
Change : Quickly finds one section that requires a single value in one spot, by checking all sections at once and remembering a single section
Theory : Similar to the previous Kernel, trying to remove the second loop
Result : Surprisingly slow, gains little benefit from the section logic and shared memory, yet is still required to count all values
{| class="wikitable mw-collapsible mw-collapsed"
! Notify - Determines a single section that has a limited value (removes section loop)
|-
|
__global__ void solve(int* d_a) {
// Used to remember which row | col | box ( section ) have which values
__shared__ bool rowHas[N][N];
__shared__ bool colHas[N][N];
__shared__ bool boxHas[N][N];
// Used to ensure that the table has changed
__shared__ bool changed;
// Number of spaces which can place the number in each section
__shared__ int rowCount[N][N];
__shared__ int colCount[N][N];
__shared__ int boxCount[N][N];
// Where the square is located in the Sudoku
int row = threadIdx.x;
int col = threadIdx.y;
int box = row / BOX_W + (col / BOX_W) * BOX_W;
// Unique identifier for each square in row, col, box
// Corresponds to the generic Sudoku Solve
// Using a Sudoku to solve a Sudoku !!!
int offset = col + (row % BOX_W) * BOX_W + (box % BOX_W);
// Square's location in the Sudoku
int gridIdx = col * N + row;
int at = d_a[gridIdx];
bool notSeen[N];
for (int i = 0; i < N; ++i)
notSeen[i] = true;
rowHas[col][row] = false;
colHas[col][row] = false;
boxHas[col][row] = false;
__syncthreads();
__shared__ int notify;
if (at != UNASSIGNED) {
rowHas[row][at - 1] = true;
colHas[col][at - 1] = true;
boxHas[box][at - 1] = true;
}
// Previous loop has not changed any values
do {
// RESET counters
rowCount[col][row] = 0;
colCount[col][row] = 0;
boxCount[col][row] = 0;
__syncthreads();
if (gridIdx == 0) { // forget previous change
changed = false;
notify = -1;
}
int count = 0; // number of values which can fit in this square
int guess = 0; // last value found which can fit in this square
for (int idx = 0; idx < N; ++idx) {
// Ensures that every square in each section is working on a different number in the section
int num = (idx + offset) % N;
if (at == UNASSIGNED && notSeen[num]) {
if (rowHas[row][num] || boxHas[box][num] || colHas[col][num])
notSeen[num] = false;
else {
++count;
guess = num;
rowCount[row][num]++;
colCount[col][num]++;
boxCount[box][num]++;
}
}
__syncthreads();
}
if (rowCount[row][col] == 1 || colCount[row][col] == 1 || boxCount[row][col] == 1)
notify = col;
__syncthreads();
// Find values which can go in only one spot in the section
if (notify > 0 && at == UNASSIGNED && notSeen[notify] &&
(rowCount[row][notify] == 1 || boxCount[box][notify] == 1 || colCount[col][notify] == 1)) {
// In this section this value can only appear in this square
guess = notify;
count = 1;
}
if (count == 1) {
at = guess + 1;
rowHas[row][guess] = true;
colHas[col][guess] = true;
boxHas[box][guess] = true;
changed = true;
}
__syncthreads();
} while (changed);
//SOLVED CHECK
if (!(rowHas[row][col] || colHas[row][col] || boxHas[row][col]))
changed = true;
__syncthreads();
if (changed && gridIdx == 0)
at = 0;
d_a[gridIdx] = at;
}
|}
Change : Refactors the algorithm to count the total numbers that can fit in a square or section
Then counts down as values are added
Theory : Remove redundant counting logic that occurred during the Optimized Kernel each pass
Result : Not faster, HOWEVER there is a slight error, by setting notSeen = 0, the section counters will rarely reach one
{| class="wikitable mw-collapsible mw-collapsed"
! CountDown - using Int as Boolean Array(EDITED now 4.28 seconds)
|-
|
__global__ void solve(int* d_a) {
// Used to remember which row | col | box ( section ) have which values
__shared__ bool rowHas[N][N];
__shared__ bool colHas[N][N];
__shared__ bool boxHas[N][N];
// Used to ensure that the table has changed
__shared__ bool changed;
// Number of spaces which can place the number in each section
__shared__ int rowCount[N][N];
__shared__ int colCount[N][N];
__shared__ int boxCount[N][N];
// Where the square is located in the Sudoku
int row = threadIdx.x;
int col = threadIdx.y;
int box = row / BOX_W + (col / BOX_W) * BOX_W;
int gridIdx = col * N + row;
int at = d_a[gridIdx];
// Unique identifier for each square in row, col, box
// Corresponds to the generic Sudoku Solve
// Using a Sudoku to solve a Sudoku !!!
int offset = col + (row % BOX_W) * BOX_W + (box % BOX_W);
// Square's location in the Sudoku
int count = 0; //Number of values which can fit in this square
int notSeen = 0; //Boolean Array as an Integer
if (gridIdx == 0)
changed = true;
rowHas[col][row] = false;
colHas[col][row] = false;
boxHas[col][row] = false;
rowCount[col][row] = 0;
colCount[col][row] = 0;
boxCount[col][row] = 0;
__syncthreads();
if (at != UNASSIGNED) {
rowHas[row][at - 1] = true;
colHas[col][at - 1] = true;
boxHas[box][at - 1] = true;
}
__syncthreads();
int guess;
int b_shuttle = 1;
for (int idx = 0; idx < N; ++idx) {
int num = (idx + offset) % N;
if (at == UNASSIGNED && !(rowHas[row][num] || boxHas[box][num] || colHas[col][num])) {
notSeen |= b_shuttle; //this value can go here
++count; //how many values this square can have
guess = num;
//how many values this section can have
rowCount[row][num]++;
colCount[col][num]++;
boxCount[box][num]++;
}
__syncthreads();
b_shuttle <<= 1;
}
if (at == UNASSIGNED && count == 0) //NOT POSSIBLE SUDOKU
changed = false;
__syncthreads();
if (count == 1) {
at = guess + 1;
notSeen = count = 0;
rowHas[row][guess] = true;
colHas[col][guess] = true;
boxHas[box][guess] = true;
}
// Previous loop has not changed any values
while (changed) {
__syncthreads();
if (gridIdx == 0) // forget previous change
changed = false;
int b_shuttle = 1;
for (int idx = 0; idx < N; ++idx) {
// Ensures that every square in each section is working on a different number in the section
int num = (idx + offset) % N;
if (b_shuttle & notSeen &&
(at != UNASSIGNED || rowHas[row][num] || boxHas[box][num] || colHas[col][num])) {
rowCount[row][num]--;
colCount[col][num]--;
boxCount[box][num]--;
notSeen ^= b_shuttle;
--count;
}
__syncthreads();
if (b_shuttle & notSeen &&
(count == 1 || rowCount[row][num] == 1 || boxCount[box][num] == 1 || colCount[col][num] == 1)) {
rowHas[row][num] = true;
colHas[col][num] = true;
boxHas[box][num] = true;
changed = true;
notSeen ^= b_shuttle;
at = num + 1;
count = 0;
}
b_shuttle <<= 1;
}
__syncthreads();
};
if (!(rowHas[row][col] && colHas[row][col] && boxHas[box][col]))
changed = true; //HAVE NOT SOLVED the sudoku
__syncthreads();
if (changed && gridIdx == 0)
at = 0;
d_a[gridIdx] = at;
}
|}
Change : uses countdown logic with a boolean array
Result : Similar times to other Countdown kernel
{| class="wikitable mw-collapsible mw-collapsed"
! Countdown Boolean Array (EDITED - now 4.37ms)
|-
|
__global__ void solve(int* d_a) {
// Used to remember which row | col | box ( section ) have which values
__shared__ bool rowHas[N][N];
__shared__ bool colHas[N][N];
__shared__ bool boxHas[N][N];
// Used to ensure that the table has changed
__shared__ bool changed;
// Number of spaces which can place the number in each section
__shared__ int rowCount[N][N];
__shared__ int colCount[N][N];
__shared__ int boxCount[N][N];
// Where the square is located in the Sudoku
int row = threadIdx.x;
int col = threadIdx.y;
int box = row / BOX_W + (col / BOX_W) * BOX_W;
int gridIdx = col * N + row;
int at = d_a[gridIdx];
// Unique identifier for each square in row, col, box
// Corresponds to the generic Sudoku Solve
// Using a Sudoku to solve a Sudoku !!!
int offset = col + (row % BOX_W) * BOX_W + (box % BOX_W);
// Square's location in the Sudoku
int count = 0; //Number of values which can fit in this square
bool notSeen[N]; //Boolean Array as an Integer
for(int idx = 0; idx < N; ++idx)
notSeen[idx] = false;
if (gridIdx == 0)
changed = true;
rowHas[col][row] = false;
colHas[col][row] = false;
boxHas[col][row] = false;
rowCount[col][row] = 0;
colCount[col][row] = 0;
boxCount[col][row] = 0;
__syncthreads();
if (at != UNASSIGNED) {
rowHas[row][at - 1] = true;
colHas[col][at - 1] = true;
boxHas[box][at - 1] = true;
}
__syncthreads();
int guess;
for (int idx = 0; idx < N; ++idx) {
int num = (idx + offset) % N;
if (at == UNASSIGNED && !(rowHas[row][num] || boxHas[box][num] || colHas[col][num])) {
notSeen[num] = true; //this value can go here
++count; //how many values this square can have
guess = num;
//how many values this section can have
rowCount[row][num]++;
colCount[col][num]++;
boxCount[box][num]++;
}
__syncthreads();
}
if (at == UNASSIGNED && count == 0) //NOT POSSIBLE SUDOKU
changed = false;
__syncthreads();
if (count == 1) {
at = guess + 1;
count = 0;
notSeen[guess] = false;
rowHas[row][guess] = true;
colHas[col][guess] = true;
boxHas[box][guess] = true;
}
// Previous loop has not changed any values
while (changed) {
__syncthreads();
if (gridIdx == 0) // forget previous change
changed = false;
for (int idx = 0; idx < N; ++idx) {
// Ensures that every square in each section is working on a different number in the section
int num = (idx + offset) % N;
if (notSeen[num] &&
(at != UNASSIGNED || rowHas[row][num] || boxHas[box][num] || colHas[col][num])) {
rowCount[row][num]--;
colCount[col][num]--;
boxCount[box][num]--;
notSeen[num] = false;
--count;
}
__syncthreads();
if ( notSeen[num] &&
(count == 1 || rowCount[row][num] == 1 || boxCount[box][num] == 1 || colCount[col][num] == 1)) {
rowHas[row][num] = true;
colHas[col][num] = true;
boxHas[box][num] = true;
changed = true;
notSeen[num] = false;
at = num + 1;
count = 0;
}
}
__syncthreads();
};
if (!(rowHas[row][col] && colHas[row][col] && boxHas[box][col]))
changed = true; //HAVE NOT SOLVED the sudoku
__syncthreads();
if (changed && gridIdx == 0)
at = 0;
d_a[gridIdx] = at;
}
|}
[[File:Kernel_Compare.png]]
==== Occupancy Calculations ====
{| class="wikitable mw-collapsible mw-collapsed"
! For 9x9:
|-
|
[[File:Occupancy_9x9.png]]
|}
{| class="wikitable mw-collapsible mw-collapsed"
! For 16x16:
|-
|
[[File:Occupancy_16x16.png]]
|}
{| class="wikitable mw-collapsible mw-collapsed"
! For 25x25:
|-
|
[[File:Occupancy_25x25.png]]
|}