42
edits
Changes
→Assignment 3
=== Assignment 3 ===
// Source : https://oj.leetcode.com/problems/sort-colors/
// Author : Hao Chen
// Date : 2014-06-25
/**********************************************************************************
*
* Given an array with n objects colored red, white or blue, sort them so that objects of
* the same color are adjacent, with the colors in the order red, white and blue.
*
* Here, we will use the integers 0, 1, and 2 to represent the color red, white, and blue respectively.
*
* Note:
* You are not suppose to use the library's sort function for this problem.
*
* Follow up:
* > A rather straight forward solution is a two-pass algorithm using counting sort.
* > First, iterate the array counting number of 0's, 1's, and 2's, then overwrite array
* with total number of 0's, then 1's and followed by 2's.
* > Could you come up with an one-pass algorithm using only constant space?
*
**********************************************************************************/
#include <time.h>
/* Added */
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
using namespace std::chrono;
/* ========================================================================
* BUILD NOTES:
* ========================================================================
* Build with -DUSE_SERIAL=1 to compile with original serial code.
* Build with -DUSE_CUDA_A2=1 to compile with Assignment 2 kernel.
* Build with -DUSE_CUDA_A3=1 to compile with optimized Assignment 3 kernel.
* ========================================================================
* Team 0xCAFEBABE
*/
void reportTime(const char* msg, steady_clock::duration span) {
std::cout << std::fixed;
std::cout << msg << " - took - " << duration_cast<milliseconds>(span).count() << " ms" << std::endl;
}
#ifdef USE_CUDA_A2
__global__ void sortColours(int* a, int n) {
int zero = 0, two = n - 1;
for(int i = 0; i <= two; i++) {
if (a[i] == 0){
int t = a[zero];
a[zero++] = a[i];
a[i] = t;
}
if (a[i]==2){
int t = a[two];
a[two--] = a[i];
a[i--] = t;
}
}
}
#else
/* Sort of a bucket sort implementation. */
__global__ void sortColours(int* a, int n) {
/* Index for rebuilding the array. */
int index = 0;
/* Shared memory between all three (3) threads. */
__shared__ int count[3];
/* Make the count zero (0) incase of any previous memory */
count[threadIdx.x] = 0;
/* Wait for all threads. */
__syncthreads();
if(threadIdx.x == 0) {
/* Thread 0 will iterate to the first half. */
for(int j = 0; j < (n / 2); j++) {
count[a[j]]++;
}
} else if(threadIdx.x == 1) {
/* Thread 1 will iterate starting from halfway. */
for(int j = (n / 2); j < n; j++) {
count[a[j]]++;
}
index = count[0];
}
/* Sync threads to get appropriate counts. */
__syncthreads();
/* Thread 2 needs the appropriate index for writing all '2' instances. */
if(threadIdx.x == 2) {
index = (count[0] + count[1]);
}
/* Rebuild the array. */
for(int j = index; j < (count[threadIdx.x] + index); j++) {
a[j] = threadIdx.x;
}
}
#endif
/* Old serial code */
void swap(int*a, int*b) {
int t;
t=*a;
*a = *b;
*b = t;
}
/* Old serial code x2 */
void sortColors(int a[], int n) {
int zero=0, two=n-1;
for(int i=0; i<=two; i++ ){
if (a[i]==0){
swap(&a[zero], &a[i]);
zero++;
}
if (a[i]==2){
swap(&a[two], &a[i]);
two--;
i--;
}
}
}
void printArray(int a[], int n) {
for(int i = 0; i < n; i++){
printf("%d ", a[i]);
}
printf("\n");
}
int main(int argc, char** argv) {
steady_clock::time_point ts, te;
#ifndef USE_SERIAL
cudaError_t err;
#endif
/* If no 'n' is specified, use a default of 10. */
int n = 10;
if (argc > 1) {
n = atoi(argv[1]);
}
srand(time(NULL));
int* a = new int[n];
int* d_a = nullptr;
for (int i = 0; i < n; i++){
/* Adjusting the randomness slightly to be more realistic. */
a[i] = random() % 3;
}
#ifndef USE_SERIAL
/* Dynamically allocate memory on GPU. */
err = cudaMalloc((void**) &d_a, n * sizeof(int));
if(err != cudaSuccess) {
std::cerr << "Error allocating memory via CUDA!\n";
return -1;
}
/* Copy contents from host integer array to device's. */
err = cudaMemcpy((void*) d_a, a, n * sizeof(int), cudaMemcpyHostToDevice);
if(err != cudaSuccess) {
std::cerr << "Error copying contents to device!\n";
cudaFree(d_a);
return -1;
}
#endif
/* Print before it's sorted. */
//printArray(a, n);
ts = steady_clock::now();
#ifndef USE_SERIAL
/* Call the kernel with one thread. */
sortColours<<<1,3>>>(d_a, n);
/* Wait for kernel to finish. */
cudaDeviceSynchronize();
#else
sortColors(a, n);
#endif
te = steady_clock::now();
#ifndef USE_SERIAL
/* Copy result from device to host. */
err = cudaMemcpy((void*) a, d_a, n * sizeof(int), cudaMemcpyDeviceToHost);
if(err != cudaSuccess) {
std::cerr << "Error copying contents to host from device!\n";
cudaFree(d_a);
return -1;
}
/* Free allocated memory. */
cudaFree(d_a);
#endif
/* Print after it's sorted. */
// printArray(a, n);
delete[] a;
reportTime("Colour Sort", te - ts);
}