Open main menu

CDOT Wiki β

Changes

0xCAFEBABE

69 bytes added, 04:00, 16 December 2015
Assignment 3
=== Assignment 3 ===
<source lang="cpp">
// Source : https://oj.leetcode.com/problems/sort-colors/
 
// Author : Hao Chen
 
// Date : 2014-06-25
/**********************************************************************************
  * Given an array with n objects colored red, white or blue, sort them so that objects of  * the same color are adjacent, with the colors in the order red, white and blue.  * Here, we will use the integers 0, 1, and 2 to represent the color red, white, and blue respectively.  * Note:  * You are not suppose to use the library's sort function for this problem.  * Follow up:  * > A rather straight forward solution is a two-pass algorithm using counting sort.  * > First, iterate the array counting number of 0's, 1's, and 2's, then overwrite array  * with total number of 0's, then 1's and followed by 2's.  * > Could you come up with an one-pass algorithm using only constant space?  **********************************************************************************/
#include <time.h>
 
/* Added */
 
#include <iostream>
 
#include <cuda_runtime.h>
 
#include <chrono>
 
using namespace std::chrono;
 
/* ========================================================================
  * BUILD NOTES:  * ========================================================================  * Build with -DUSE_SERIAL=1 to compile with original serial code.  * Build with -DUSE_CUDA_A2=1 to compile with Assignment 2 kernel.  * Build with -DUSE_CUDA_A3=1 to compile with optimized Assignment 3 kernel.  * ========================================================================  * Team 0xCAFEBABE  */
void reportTime(const char* msg, steady_clock::duration span) {
  std::cout << std::fixed;  std::cout << msg << " - took - " << duration_cast<milliseconds>(span).count() << " ms" << std::endl; 
}
#ifdef USE_CUDA_A2
 
__global__ void sortColours(int* a, int n) {
  int zero = 0, two = n - 1;  for(int i = 0; i <= two; i++) {  if (a[i] == 0){  int t = a[zero];  a[zero++] = a[i]; a[i] = t; } if (a[i] = =2){ int t= a[two]; a[two--] = a[i]; a[i--] = t; } }
}
 
if (a[i]==2){
 
int t = a[two];
 
a[two--] = a[i];
 
a[i--] = t;
 
}
 
}
 
}
 
#else
 
/* Sort of a bucket sort implementation. */
 
__global__ void sortColours(int* a, int n) {
  /* Index for rebuilding the array. */  int index = 0;  /* Shared memory between all three (3) threads. */  __shared__ int count[3];  /* Make the count zero (0) incase of any previous memory */  count[threadIdx.x] = 0;  /* Wait for all threads. */  __syncthreads();  if(threadIdx.x == 0) {  /* Thread 0 will iterate to the first half. */  for(int j = 0; j < (n / 2); j++) { count[a[j]]++; } } else if(threadIdx.x == 1) { /* Thread 1 will iterate starting from halfway. */ for(int j = (n / 2); j < n; j++) { count[a[j]]++; } index = count[0]; } /* Sync threads to get appropriate counts. */ __syncthreads(); /* Thread 2 needs the appropriate index for writing all '2' instances. */ if(threadIdx.x == 2) { index = (count[0] + count[1]); } /* Rebuild the array. */ for(int j = index; j < (count[threadIdx.x] + index); j++) { a[j] = threadIdx.x; }
}
 
} else if(threadIdx.x == 1) {
 
/* Thread 1 will iterate starting from halfway. */
 
for(int j = (n / 2); j < n; j++) {
 
count[a[j]]++;
 
}
 
index = count[0];
 
}
 
/* Sync threads to get appropriate counts. */
 
__syncthreads();
 
/* Thread 2 needs the appropriate index for writing all '2' instances. */
 
if(threadIdx.x == 2) {
 
index = (count[0] + count[1]);
 
}
 
/* Rebuild the array. */
 
for(int j = index; j < (count[threadIdx.x] + index); j++) {
 
a[j] = threadIdx.x;
 
}
 
}
 
#endif
/* Old serial code */
 
void swap(int*a, int*b) {
  int t;  t=*a;  *a = *b;  *b = t; 
}
 
/* Old serial code x2 */
 
void sortColors(int a[], int n) {
  int zero=0, two=n-1;  for(int i=0; i<=two; i++ ){  if (a[i]==0){  swap(&a[zero], &a[i]);  zero++;  if (a[i]==2){  swap(&a[two], &a[i]);  two--;  i--;  }
}
void printArray(int a[], int n) {
  for(int i = 0; i < n; i++){  printf("%d ", a[i]);  printf("\n"); 
}
int main(int argc, char** argv) {
  steady_clock::time_point ts, te; 
#ifndef USE_SERIAL
  cudaError_t err; 
#endif
  /* If no 'n' is specified, use a default of 10. */  int n = 10;  if (argc > 1) {  n = atoi(argv[1]);  srand(time(NULL)); int* a = new int[n];  int* d_a = nullptr; for (int i = 0; i < n; i++){  /* Adjusting the randomness slightly to be more realistic. */  a[i] = random() % 3; 
#ifndef USE_SERIAL
  /* Dynamically allocate memory on GPU. */  err = cudaMalloc((void**) &d_a, n * sizeof(int));  if(err != cudaSuccess) {  std::cerr << "Error allocating memory via CUDA!\n";  return -1;  } /* Copy contents from host integer array to device's. */  err = cudaMemcpy((void*) d_a, a, n * sizeof(int), cudaMemcpyHostToDevice);  if(err != cudaSuccess) {  std::cerr << "Error copying contents to device!\n";  cudaFree(d_a);  return -1; 
#endif
  /* Print before it's sorted. */  //printArray(a, n);  ts = steady_clock::now(); 
#ifndef USE_SERIAL
  /* Call the kernel with one thread. */  sortColours<<<1,3>>>(d_a, n); /* Wait for kernel to finish. */  cudaDeviceSynchronize(); 
#else
  sortColors(a, n); 
#endif
  te = steady_clock::now();
#ifndef USE_SERIAL
  /* Copy result from device to host. */  err = cudaMemcpy((void*) a, d_a, n * sizeof(int), cudaMemcpyDeviceToHost);  if(err != cudaSuccess) {  std::cerr << "Error copying contents to host from device!\n";  cudaFree(d_a);  return -1; } /* Free allocated memory. */ cudaFree(d_a);#endif /* Print after it's sorted. */ // printArray(a, n); delete[] a; reportTime("Colour Sort", te - ts);
}
</* Free allocated memory. */ cudaFree(d_a); #endif /* Print after it's sorted. */ // printArray(a, n); delete[] a; reportTime("Colour Sort", te - ts); }source>