=== Assignment 3 ===
{| class="wikitable mw-collapsible mw-collapsed"! Optimized kernel : Single Pass|-| __global__ void superSolve(int * d_a) { //Used to remember which row | col | box ( section ) have which values __shared__ bool rowHas[N][N]; __shared__ bool colHas[N][N]; __shared__ bool boxHas[N][N]; //Used to ensure that the table has changed __shared__ bool changed; //Number of spaces which can place the number in each section __shared__ int rowCount[N][N]; __shared__ int colCount[N][N]; __shared__ int boxCount[N][N]; //Where the square is located in the Sudoku int row = threadIdx.x; int col = threadIdx.y; int box = row / BOXWIDTH + (col / BOXWIDTH) * BOXWIDTH; //Unique identifier for each square in row, col, box //Corresponds to the generic Sudoku Solve //Using a Sudoku to solve a Sudoku !!! int offset = col + (row % BOXWIDTH) * BOXWIDTH + (box % BOXWIDTH); //Square's location in the Sudoku int gridIdx = col * N + row; int at = d_a[gridIdx]; bool notSeen[N]; for (int i = 0; i < N; ++i) notSeen[i] = true; rowHas[col][row] = false; colHas[col][row] = false; boxHas[col][row] = false; __syncthreads(); if (at != UNASSIGNED) { rowHas[row][at - 1] = true; colHas[col][at - 1] = true; boxHas[box][at - 1] = true; } //Previous loop has not changed any values do { //RESET counters rowCount[col][row] = 0; colCount[col][row] = 0; boxCount[col][row] = 0; __syncthreads(); if (gridIdx == 0) //forget previous change changed = false; int count = 0; //number of values which can fit in this square int guess = 0; //last value found which can fit in this square for (int idx = 0; idx < N; ++idx) { //Ensures that every square in each section is working on a different number in the section int num = (idx + offset) % N; if (at == UNASSIGNED && notSeen[num]){ if (rowHas[row][num] || boxHas[box][num] || colHas[col][num]){ notSeen[num] = false; } else { ++count; guess = num; rowCount[row][num] ++; colCount[col][num] ++; boxCount[box][num] ++; } } __syncthreads(); } //Find values which can go in only one spot in the section for (int idx = 0; idx < N && count > 1; ++idx) { if (notSeen[idx] && ( rowCount[row][idx] == 1 || boxCount[box][idx] == 1 || colCount[col][idx] == 1)) { //In this section this value can only appear in this square guess = idx; count = 1; } } if (count == 1) { at = guess + 1; rowHas[row][guess] = true; colHas[col][guess] = true; boxHas[box][guess] = true; changed = true; } __syncthreads(); } while (changed ); d_a[gridIdx] = at; }|} Reduced superSolve runtime from 5.1 2 to 13.2ms8ms
Changes:
Used faster memoryReduced Thread Divergence/CGMA
-each thread now remembers which values it has seen in a boolean array
Reduced Thread Divergence/CGMA - values are only assigned to the grid after the kernel 'solves' the sudoku removing wait times for assigning to global memory - at value in kernel and shared memory for rowHas, colHas, boxHas, updated in a single place, reducing wait time for updating
Coalesced Memory
- change modifying _Has and _Count arrays from colrow->row col to rowcol->col row as row(threadIdx.x) is our fastest moving dimension - query in order of row->box->col for the same reason
Clarified Code
- use gridIdx == 0 rather then !gridIdx