Difference between revisions of "0xCAFEBABE"
Theodor Dule (talk | contribs) |
(→Assignment 3) |
||
(3 intermediate revisions by one other user not shown) | |||
Line 117: | Line 117: | ||
=== Assignment 2 === | === Assignment 2 === | ||
+ | |||
+ | During our initial attempt at parallelizing a Rubiks Cube solver, we came across issues with compiling with the CUDA compiler. The issue turned out to be compiling CUDA code with C. Our first attempt at resolving this issue was by adding extern "C" to all affected files. While that resolved some linking errors, we were still unable to properly link it. We decided to instead turn our attention to Theo's colour sorter that he posted for Assignment 1. We were able to successfully parallelize it with a single thread. The performance increase from serial to CUDA was extremely significant to the point to where we had to increase n to 10^10 to actually measure the difference. As we progress to part 3 of these assignments, we see a constraint of the sorter being fixed to digits from 0 to 2. We hope to remove this restriction in conjunction with optimizing it. | ||
+ | |||
=== Assignment 3 === | === Assignment 3 === | ||
+ | <source lang="cpp"> | ||
+ | // Source : https://oj.leetcode.com/problems/sort-colors/ | ||
+ | // Author : Hao Chen | ||
+ | // Date : 2014-06-25 | ||
+ | |||
+ | /********************************************************************************** | ||
+ | * | ||
+ | * Given an array with n objects colored red, white or blue, sort them so that objects of | ||
+ | * the same color are adjacent, with the colors in the order red, white and blue. | ||
+ | * | ||
+ | * Here, we will use the integers 0, 1, and 2 to represent the color red, white, and blue respectively. | ||
+ | * | ||
+ | * Note: | ||
+ | * You are not suppose to use the library's sort function for this problem. | ||
+ | * | ||
+ | * Follow up: | ||
+ | * > A rather straight forward solution is a two-pass algorithm using counting sort. | ||
+ | * > First, iterate the array counting number of 0's, 1's, and 2's, then overwrite array | ||
+ | * with total number of 0's, then 1's and followed by 2's. | ||
+ | * > Could you come up with an one-pass algorithm using only constant space? | ||
+ | * | ||
+ | **********************************************************************************/ | ||
+ | |||
+ | #include <time.h> | ||
+ | /* Added */ | ||
+ | #include <iostream> | ||
+ | #include <cuda_runtime.h> | ||
+ | #include <chrono> | ||
+ | using namespace std::chrono; | ||
+ | /* ======================================================================== | ||
+ | * BUILD NOTES: | ||
+ | * ======================================================================== | ||
+ | * Build with -DUSE_SERIAL=1 to compile with original serial code. | ||
+ | * Build with -DUSE_CUDA_A2=1 to compile with Assignment 2 kernel. | ||
+ | * Build with -DUSE_CUDA_A3=1 to compile with optimized Assignment 3 kernel. | ||
+ | * ======================================================================== | ||
+ | * Team 0xCAFEBABE | ||
+ | */ | ||
+ | |||
+ | void reportTime(const char* msg, steady_clock::duration span) { | ||
+ | std::cout << std::fixed; | ||
+ | std::cout << msg << " - took - " << duration_cast<milliseconds>(span).count() << " ms" << std::endl; | ||
+ | } | ||
+ | |||
+ | #ifdef USE_CUDA_A2 | ||
+ | __global__ void sortColours(int* a, int n) { | ||
+ | int zero = 0, two = n - 1; | ||
+ | for(int i = 0; i <= two; i++) { | ||
+ | if (a[i] == 0){ | ||
+ | int t = a[zero]; | ||
+ | a[zero++] = a[i]; | ||
+ | a[i] = t; | ||
+ | } | ||
+ | if (a[i]==2){ | ||
+ | int t = a[two]; | ||
+ | a[two--] = a[i]; | ||
+ | a[i--] = t; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | #else | ||
+ | /* Sort of a bucket sort implementation. */ | ||
+ | __global__ void sortColours(int* a, int n) { | ||
+ | /* Index for rebuilding the array. */ | ||
+ | int index = 0; | ||
+ | /* Shared memory between all three (3) threads. */ | ||
+ | __shared__ int count[3]; | ||
+ | /* Make the count zero (0) incase of any previous memory */ | ||
+ | count[threadIdx.x] = 0; | ||
+ | /* Wait for all threads. */ | ||
+ | __syncthreads(); | ||
+ | if(threadIdx.x == 0) { | ||
+ | /* Thread 0 will iterate to the first half. */ | ||
+ | for(int j = 0; j < (n / 2); j++) { | ||
+ | count[a[j]]++; | ||
+ | } | ||
+ | } else if(threadIdx.x == 1) { | ||
+ | /* Thread 1 will iterate starting from halfway. */ | ||
+ | for(int j = (n / 2); j < n; j++) { | ||
+ | count[a[j]]++; | ||
+ | } | ||
+ | index = count[0]; | ||
+ | } | ||
+ | /* Sync threads to get appropriate counts. */ | ||
+ | __syncthreads(); | ||
+ | /* Thread 2 needs the appropriate index for writing all '2' instances. */ | ||
+ | if(threadIdx.x == 2) { | ||
+ | index = (count[0] + count[1]); | ||
+ | } | ||
+ | /* Rebuild the array. */ | ||
+ | for(int j = index; j < (count[threadIdx.x] + index); j++) { | ||
+ | a[j] = threadIdx.x; | ||
+ | } | ||
+ | } | ||
+ | #endif | ||
+ | |||
+ | /* Old serial code */ | ||
+ | void swap(int*a, int*b) { | ||
+ | int t; | ||
+ | t=*a; | ||
+ | *a = *b; | ||
+ | *b = t; | ||
+ | } | ||
+ | /* Old serial code x2 */ | ||
+ | void sortColors(int a[], int n) { | ||
+ | int zero=0, two=n-1; | ||
+ | for(int i=0; i<=two; i++ ){ | ||
+ | if (a[i]==0){ | ||
+ | swap(&a[zero], &a[i]); | ||
+ | zero++; | ||
+ | } | ||
+ | if (a[i]==2){ | ||
+ | swap(&a[two], &a[i]); | ||
+ | two--; | ||
+ | i--; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | void printArray(int a[], int n) { | ||
+ | for(int i = 0; i < n; i++){ | ||
+ | printf("%d ", a[i]); | ||
+ | } | ||
+ | printf("\n"); | ||
+ | } | ||
+ | |||
+ | int main(int argc, char** argv) { | ||
+ | steady_clock::time_point ts, te; | ||
+ | #ifndef USE_SERIAL | ||
+ | cudaError_t err; | ||
+ | #endif | ||
+ | /* If no 'n' is specified, use a default of 10. */ | ||
+ | int n = 10; | ||
+ | if (argc > 1) { | ||
+ | n = atoi(argv[1]); | ||
+ | } | ||
+ | srand(time(NULL)); | ||
+ | |||
+ | int* a = new int[n]; | ||
+ | int* d_a = nullptr; | ||
+ | |||
+ | for (int i = 0; i < n; i++){ | ||
+ | /* Adjusting the randomness slightly to be more realistic. */ | ||
+ | a[i] = random() % 3; | ||
+ | } | ||
+ | #ifndef USE_SERIAL | ||
+ | /* Dynamically allocate memory on GPU. */ | ||
+ | err = cudaMalloc((void**) &d_a, n * sizeof(int)); | ||
+ | if(err != cudaSuccess) { | ||
+ | std::cerr << "Error allocating memory via CUDA!\n"; | ||
+ | return -1; | ||
+ | } | ||
+ | |||
+ | /* Copy contents from host integer array to device's. */ | ||
+ | err = cudaMemcpy((void*) d_a, a, n * sizeof(int), cudaMemcpyHostToDevice); | ||
+ | if(err != cudaSuccess) { | ||
+ | std::cerr << "Error copying contents to device!\n"; | ||
+ | cudaFree(d_a); | ||
+ | return -1; | ||
+ | } | ||
+ | #endif | ||
+ | /* Print before it's sorted. */ | ||
+ | //printArray(a, n); | ||
+ | ts = steady_clock::now(); | ||
+ | #ifndef USE_SERIAL | ||
+ | /* Call the kernel with one thread. */ | ||
+ | sortColours<<<1,3>>>(d_a, n); | ||
+ | |||
+ | /* Wait for kernel to finish. */ | ||
+ | cudaDeviceSynchronize(); | ||
+ | #else | ||
+ | sortColors(a, n); | ||
+ | #endif | ||
+ | te = steady_clock::now(); | ||
+ | |||
+ | #ifndef USE_SERIAL | ||
+ | /* Copy result from device to host. */ | ||
+ | err = cudaMemcpy((void*) a, d_a, n * sizeof(int), cudaMemcpyDeviceToHost); | ||
+ | if(err != cudaSuccess) { | ||
+ | std::cerr << "Error copying contents to host from device!\n"; | ||
+ | cudaFree(d_a); | ||
+ | return -1; | ||
+ | } | ||
+ | |||
+ | /* Free allocated memory. */ | ||
+ | cudaFree(d_a); | ||
+ | #endif | ||
+ | /* Print after it's sorted. */ | ||
+ | // printArray(a, n); | ||
+ | |||
+ | delete[] a; | ||
+ | reportTime("Colour Sort", te - ts); | ||
+ | } | ||
+ | |||
+ | </source> |
Latest revision as of 04:00, 16 December 2015
Contents
0xCAFEBABE
Team Members
Progress
Assignment 1
Profile 1: Solving unsteady 1D heat transfer equation
For this project, the code analyzed was to solve a problem with constant value boundary condition.
The code was borrowed from Dr. Pengfei Du, and I had to split up his class calls into function calls, respectively, to help me better analyze it.
Flat profile: Each sample counts as 0.01 seconds.
% cumulative self self total time seconds seconds calls Ts/call Ts/call name 0.03 0.01 0.01 1 0.00 0.00 _GLOBAL__sub_I__Z5setupR14pdu1dsteadyExp 99.78 1.03 1.02 1 1.01 1.01 _GLOBAL__sub_I__Z5solvingR15pdu1dsteadyExp
As shown, most of the time taken was in the solving function, which took care of calculating the wave.
Here is the result in a graphical animation: http://pengfeidu.net/Media/gif/1d-animated.gif
- Martin Ristov
Profile 2: Solving a Rubiks Cube
My choice of an application to profile is a Rubiks cube solver. While a Rubiks cube is typically thought of as a 3x3 cube, you can exponentially increase the number of squares. By doing so, solving a much larger Rubiks cube on a typical CPU becomes useless.
I came across a github repo for a (GPL v3) Rubiks cube solver at: https://github.com/brownan/Rubiks-Cube-Solver
I compiled Brown's code with profiling flags and got some interesting results.
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total time seconds seconds calls s/call s/call name 44.24 301.99 301.99 1768367109 0.00 0.00 cube_turn 41.94 588.30 286.31 1768367110 0.00 0.00 edge_hash2 13.51 680.53 92.24 1 92.24 682.02 edge_generate 0.10 681.24 0.71 132162051 0.00 0.00 stack_push 0.06 681.67 0.43 cube_120convert 0.05 682.03 0.36 edge_hash1 0.04 682.33 0.31 132162010 0.00 0.00 stack_peek_cube 0.03 682.52 0.19 132162010 0.00 0.00 stack_peek_distance 0.03 682.70 0.18 132162051 0.00 0.00 stack_pop 0.02 682.81 0.11 132162010 0.00 0.00 stack_peek_turn 0.00 682.82 0.02 _GLOBAL__sub_I_65535_0_cube_solved 0.00 682.82 0.00 1 0.00 0.00 edge_write 0.00 682.82 0.00 1 0.00 682.02 make_edge
cube_turn and edge_hash2 are the top two hotspots.
cube_turn - https://github.com/brownan/Rubiks-Cube-Solver/blob/master/cube.c#L181
edge_hash2 - https://github.com/brownan/Rubiks-Cube-Solver/blob/master/edgetable.c#L227
I feel like either two of these hotspots would be an excellent opportunity to parallelize.
~ Steven
Profile 3: Solving a Maze
I decided to profile an application which generates a maze according to the user provided rows and columns and then provides with the solution for the generated maze. As a maze can quickly rise in complexity when dealing with larger number of rows and columns, I believe that this type of an application could be a perfect fit to parallelize and make use of a much powerful GPU.
The above application has been taken form the following github repo : https://github.com/Syntaf/MazeSolver
Flat Profile :
Each sample counts as 0.01 seconds.
% cumulative self self total time seconds seconds calls ms/call ms/call name 99.97 13.00 13.00 mazeGen::printMazeData(std::string) const 0.08 13.01 0.01 1 10.00 10.00 AStar::findPath(int const&, int const&, int, int const&, std::vector<std::vector<int, std::allocator<int> >, std::allocator<std::vector<int, std::allocator<int> > > >) 0.00 13.01 0.00 159198 0.00 0.00 disjointSets::setFind(int) 0.00 13.01 0.00 70742 0.00 0.00 void std::__adjust_heap<__gnu_cxx::__normal_iterator<node*, std::vector<node, std::allocator<node> > >, long, node, std::less<node> >(__gnu_cxx::__normal_iterator<node*, std::vector<node, std::allocator<node> > >, long, long, node, std::less<node>) 0.00 13.01 0.00 39999 0.00 0.00 disjointSets::setUnion(int, int) 0.00 13.01 0.00 4433 0.00 0.00 void std::vector<int, std::allocator<int> >::_M_emplace_back_aux<int>(int&&) 0.00 13.01 0.00 803 0.00 0.00 void std::vector<char, std::allocator<char> >::emplace_back<char>(char&&) [clone .constprop.131] 0.00 13.01 0.00 10 0.00 0.00 void std::vector<std::vector<char, std::allocator<char> >, std::allocator<std::vector<char, std::allocator<char> > > >::_M_emplace_back_aux<std::vector<char, std::allocator<char> > const&>(std::vector<char, std::allocator<char> > const&) 0.00 13.01 0.00 9 0.00 0.00 void std::vector<node, std::allocator<node> >::_M_emplace_back_aux<node const&>(node const&) 0.00 13.01 0.00 1 0.00 0.00 _GLOBAL__sub_I__ZN12disjointSetsC2Ei 0.00 13.01 0.00 1 0.00 0.00 _GLOBAL__sub_I__ZN4node14updatePriorityERKiS1_ 0.00 13.01 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 13.01 0.00 1 0.00 0.00 disjointSets::disjointSets(int) 0.00 13.01 0.00 1 0.00 0.00 disjointSets::~disjointSets() 0.00 13.01 0.00 1 0.00 0.00 AStar::makeReadyMap(std::vector<std::vector<char, std::allocator<char> >, std::allocator<std::vector<char, std::allocator<char> > > > const&)
As you can see the maximum time spent by the CPU is while generating the maze, due to a number of nested loops which evaluate heavy computation of the maze data
- Luv
Profile 4: Color Sorter
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total time seconds seconds calls s/call s/call name 49.79 9.61 9.61 main 39.15 17.16 7.55 1 7.55 9.69 sortColors(int*, int) 11.06 19.30 2.13 334007584 0.00 0.00 swap(int*, int*) 0.00 19.30 0.00 2 0.00 0.00 printArray(int*, int)
- Theo Dule
Assignment 2
During our initial attempt at parallelizing a Rubiks Cube solver, we came across issues with compiling with the CUDA compiler. The issue turned out to be compiling CUDA code with C. Our first attempt at resolving this issue was by adding extern "C" to all affected files. While that resolved some linking errors, we were still unable to properly link it. We decided to instead turn our attention to Theo's colour sorter that he posted for Assignment 1. We were able to successfully parallelize it with a single thread. The performance increase from serial to CUDA was extremely significant to the point to where we had to increase n to 10^10 to actually measure the difference. As we progress to part 3 of these assignments, we see a constraint of the sorter being fixed to digits from 0 to 2. We hope to remove this restriction in conjunction with optimizing it.
Assignment 3
// Source : https://oj.leetcode.com/problems/sort-colors/
// Author : Hao Chen
// Date : 2014-06-25
/**********************************************************************************
*
* Given an array with n objects colored red, white or blue, sort them so that objects of
* the same color are adjacent, with the colors in the order red, white and blue.
*
* Here, we will use the integers 0, 1, and 2 to represent the color red, white, and blue respectively.
*
* Note:
* You are not suppose to use the library's sort function for this problem.
*
* Follow up:
* > A rather straight forward solution is a two-pass algorithm using counting sort.
* > First, iterate the array counting number of 0's, 1's, and 2's, then overwrite array
* with total number of 0's, then 1's and followed by 2's.
* > Could you come up with an one-pass algorithm using only constant space?
*
**********************************************************************************/
#include <time.h>
/* Added */
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
using namespace std::chrono;
/* ========================================================================
* BUILD NOTES:
* ========================================================================
* Build with -DUSE_SERIAL=1 to compile with original serial code.
* Build with -DUSE_CUDA_A2=1 to compile with Assignment 2 kernel.
* Build with -DUSE_CUDA_A3=1 to compile with optimized Assignment 3 kernel.
* ========================================================================
* Team 0xCAFEBABE
*/
void reportTime(const char* msg, steady_clock::duration span) {
std::cout << std::fixed;
std::cout << msg << " - took - " << duration_cast<milliseconds>(span).count() << " ms" << std::endl;
}
#ifdef USE_CUDA_A2
__global__ void sortColours(int* a, int n) {
int zero = 0, two = n - 1;
for(int i = 0; i <= two; i++) {
if (a[i] == 0){
int t = a[zero];
a[zero++] = a[i];
a[i] = t;
}
if (a[i]==2){
int t = a[two];
a[two--] = a[i];
a[i--] = t;
}
}
}
#else
/* Sort of a bucket sort implementation. */
__global__ void sortColours(int* a, int n) {
/* Index for rebuilding the array. */
int index = 0;
/* Shared memory between all three (3) threads. */
__shared__ int count[3];
/* Make the count zero (0) incase of any previous memory */
count[threadIdx.x] = 0;
/* Wait for all threads. */
__syncthreads();
if(threadIdx.x == 0) {
/* Thread 0 will iterate to the first half. */
for(int j = 0; j < (n / 2); j++) {
count[a[j]]++;
}
} else if(threadIdx.x == 1) {
/* Thread 1 will iterate starting from halfway. */
for(int j = (n / 2); j < n; j++) {
count[a[j]]++;
}
index = count[0];
}
/* Sync threads to get appropriate counts. */
__syncthreads();
/* Thread 2 needs the appropriate index for writing all '2' instances. */
if(threadIdx.x == 2) {
index = (count[0] + count[1]);
}
/* Rebuild the array. */
for(int j = index; j < (count[threadIdx.x] + index); j++) {
a[j] = threadIdx.x;
}
}
#endif
/* Old serial code */
void swap(int*a, int*b) {
int t;
t=*a;
*a = *b;
*b = t;
}
/* Old serial code x2 */
void sortColors(int a[], int n) {
int zero=0, two=n-1;
for(int i=0; i<=two; i++ ){
if (a[i]==0){
swap(&a[zero], &a[i]);
zero++;
}
if (a[i]==2){
swap(&a[two], &a[i]);
two--;
i--;
}
}
}
void printArray(int a[], int n) {
for(int i = 0; i < n; i++){
printf("%d ", a[i]);
}
printf("\n");
}
int main(int argc, char** argv) {
steady_clock::time_point ts, te;
#ifndef USE_SERIAL
cudaError_t err;
#endif
/* If no 'n' is specified, use a default of 10. */
int n = 10;
if (argc > 1) {
n = atoi(argv[1]);
}
srand(time(NULL));
int* a = new int[n];
int* d_a = nullptr;
for (int i = 0; i < n; i++){
/* Adjusting the randomness slightly to be more realistic. */
a[i] = random() % 3;
}
#ifndef USE_SERIAL
/* Dynamically allocate memory on GPU. */
err = cudaMalloc((void**) &d_a, n * sizeof(int));
if(err != cudaSuccess) {
std::cerr << "Error allocating memory via CUDA!\n";
return -1;
}
/* Copy contents from host integer array to device's. */
err = cudaMemcpy((void*) d_a, a, n * sizeof(int), cudaMemcpyHostToDevice);
if(err != cudaSuccess) {
std::cerr << "Error copying contents to device!\n";
cudaFree(d_a);
return -1;
}
#endif
/* Print before it's sorted. */
//printArray(a, n);
ts = steady_clock::now();
#ifndef USE_SERIAL
/* Call the kernel with one thread. */
sortColours<<<1,3>>>(d_a, n);
/* Wait for kernel to finish. */
cudaDeviceSynchronize();
#else
sortColors(a, n);
#endif
te = steady_clock::now();
#ifndef USE_SERIAL
/* Copy result from device to host. */
err = cudaMemcpy((void*) a, d_a, n * sizeof(int), cudaMemcpyDeviceToHost);
if(err != cudaSuccess) {
std::cerr << "Error copying contents to host from device!\n";
cudaFree(d_a);
return -1;
}
/* Free allocated memory. */
cudaFree(d_a);
#endif
/* Print after it's sorted. */
// printArray(a, n);
delete[] a;
reportTime("Colour Sort", te - ts);
}