20
edits
Changes
→Assignment 3
=== Assignment 3 ===
<source lang="cpp">
// Source : https://oj.leetcode.com/problems/sort-colors/
// Author : Hao Chen
// Date : 2014-06-25
/**********************************************************************************
* * Given an array with n objects colored red, white or blue, sort them so that objects of * the same color are adjacent, with the colors in the order red, white and blue. * * Here, we will use the integers 0, 1, and 2 to represent the color red, white, and blue respectively. * * Note: * You are not suppose to use the library's sort function for this problem. * * Follow up: * > A rather straight forward solution is a two-pass algorithm using counting sort. * > First, iterate the array counting number of 0's, 1's, and 2's, then overwrite array * with total number of 0's, then 1's and followed by 2's. * > Could you come up with an one-pass algorithm using only constant space? * **********************************************************************************/
#include <time.h>
/* Added */
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
using namespace std::chrono;
/* ========================================================================
* BUILD NOTES: * ======================================================================== * Build with -DUSE_SERIAL=1 to compile with original serial code. * Build with -DUSE_CUDA_A2=1 to compile with Assignment 2 kernel. * Build with -DUSE_CUDA_A3=1 to compile with optimized Assignment 3 kernel. * ======================================================================== * Team 0xCAFEBABE */
void reportTime(const char* msg, steady_clock::duration span) {
std::cout << std::fixed; std::cout << msg << " - took - " << duration_cast<milliseconds>(span).count() << " ms" << std::endl;
}
#ifdef USE_CUDA_A2
__global__ void sortColours(int* a, int n) {
int zero = 0, two = n - 1; for(int i = 0; i <= two; i++) { if (a[i] == 0){ int t = a[zero]; a[zero++] = a[i]; a[i] = t; } if (a[i] = =2){ int t= a[two]; a[two--] = a[i]; a[i--] = t; } }
}
#else
/* Sort of a bucket sort implementation. */
__global__ void sortColours(int* a, int n) {
/* Index for rebuilding the array. */ int index = 0; /* Shared memory between all three (3) threads. */ __shared__ int count[3]; /* Make the count zero (0) incase of any previous memory */ count[threadIdx.x] = 0; /* Wait for all threads. */ __syncthreads(); if(threadIdx.x == 0) { /* Thread 0 will iterate to the first half. */ for(int j = 0; j < (n / 2); j++) { count[a[j]]++; } } else if(threadIdx.x == 1) { /* Thread 1 will iterate starting from halfway. */ for(int j = (n / 2); j < n; j++) { count[a[j]]++; } index = count[0]; } /* Sync threads to get appropriate counts. */ __syncthreads(); /* Thread 2 needs the appropriate index for writing all '2' instances. */ if(threadIdx.x == 2) { index = (count[0] + count[1]); } /* Rebuild the array. */ for(int j = index; j < (count[threadIdx.x] + index); j++) { a[j] = threadIdx.x; }
}
#endif
/* Old serial code */
void swap(int*a, int*b) {
int t; t=*a; *a = *b; *b = t;
}
/* Old serial code x2 */
void sortColors(int a[], int n) {
int zero=0, two=n-1; for(int i=0; i<=two; i++ ){ if (a[i]==0){ swap(&a[zero], &a[i]); zero++; } if (a[i]==2){ swap(&a[two], &a[i]); two--; i--; } }
}
void printArray(int a[], int n) {
for(int i = 0; i < n; i++){ printf("%d ", a[i]); } printf("\n");
}
int main(int argc, char** argv) {
steady_clock::time_point ts, te;
#ifndef USE_SERIAL
cudaError_t err;
#endif
/* If no 'n' is specified, use a default of 10. */ int n = 10; if (argc > 1) { n = atoi(argv[1]); } srand(time(NULL)); int* a = new int[n]; int* d_a = nullptr; for (int i = 0; i < n; i++){ /* Adjusting the randomness slightly to be more realistic. */ a[i] = random() % 3; }
#ifndef USE_SERIAL
/* Dynamically allocate memory on GPU. */ err = cudaMalloc((void**) &d_a, n * sizeof(int)); if(err != cudaSuccess) { std::cerr << "Error allocating memory via CUDA!\n"; return -1; } /* Copy contents from host integer array to device's. */ err = cudaMemcpy((void*) d_a, a, n * sizeof(int), cudaMemcpyHostToDevice); if(err != cudaSuccess) { std::cerr << "Error copying contents to device!\n"; cudaFree(d_a); return -1; }
#endif
/* Print before it's sorted. */ //printArray(a, n); ts = steady_clock::now();
#ifndef USE_SERIAL
/* Call the kernel with one thread. */ sortColours<<<1,3>>>(d_a, n); /* Wait for kernel to finish. */ cudaDeviceSynchronize();
#else
sortColors(a, n);
#endif
te = steady_clock::now();
#ifndef USE_SERIAL
/* Copy result from device to host. */ err = cudaMemcpy((void*) a, d_a, n * sizeof(int), cudaMemcpyDeviceToHost); if(err != cudaSuccess) { std::cerr << "Error copying contents to host from device!\n"; cudaFree(d_a); return -1; } /* Free allocated memory. */ cudaFree(d_a);#endif /* Print after it's sorted. */ // printArray(a, n); delete[] a; reportTime("Colour Sort", te - ts);
}
</* Free allocated memory. */ cudaFree(d_a); #endif /* Print after it's sorted. */ // printArray(a, n); delete[] a; reportTime("Colour Sort", te - ts); }source>