Changes

Group 6

5,421 bytes added, 17:00, 7 April 2019

→‎Assignment 3 - Optimize

----

=== Assignment 3 - Optimize ===

Here is my final source code

{| class="wikitable mw-collapsible mw-collapsed"

! p03_reduction.cu

|-

|

<pre>

// part 3.1 : reduction

// update 2:

// add comments to all kernels

// modified kernel 2 so that it only returns a value derived from the number of dots inside the quadrant; this value is passed on to the next kernels

// new kernel 3 sums the elements of d_a generated by kernel 2 and stores one partial sum per block

// new kernel 4 sums the per-block partial sums before the result is passed back to the host

#include<iostream>

#include<fstream>

#include<math.h>

#include<stdlib.h>

#include<time.h>

#include <chrono>

#include <cstdlib>

#include <iomanip>

#include <cuda_runtime.h>

#include <curand_kernel.h>

// to remove intellisense highlighting

#include <device_launch_parameters.h>

#ifndef __CUDACC__

#define __CUDACC__

#endif

#include <device_functions.h>

using namespace std;

using namespace std::chrono;

// Threads per block: used for the kernel launches in main() and as the size of
// the shared-memory arrays in the reduction kernels.
constexpr int ntpb = 512;

// Host reference: Monte Carlo estimate of PI, computed n times with n samples
// per estimate.
//
// Each trial draws n random points (x, y) in the unit square with rand() and
// counts how many fall inside the unit quarter circle. The estimate
// 4 * hits / n for trial j is stored in h_a[j].
//
//   n   - number of trials, and also the number of samples per trial
//   h_a - output array with room for at least n floats
//
// Reseeds rand() from the current time on every call. Nothing is written to
// h_a when n <= 0.
void calculatePI(int n, float* h_a) {
    srand((unsigned int)time(NULL));

    for (int j = 0; j < n; j++) {
        int hit = 0;

        for (int i = 0; i < n; i++) {
            float x = float(rand()) / float(RAND_MAX);
            float y = float(rand()) / float(RAND_MAX);

            // Same inside-the-circle test as the calPI kernel. Comparing the
            // squares avoids a double-precision sqrt() call per sample.
            if (y * y <= 1.0f - x * x) {
                hit += 1;
            }
        }

        // Stored once per trial, after all n samples have been counted.
        h_a[j] = 4.0f * float(hit) / float(n);
    }
}

// kernel 1
// Seeds one cuRAND generator state per launched thread.
//
// Every thread calls curand_init() with the same fixed seed (123456), its own
// global thread index as the sequence number and a zero offset, and stores the
// resulting state at rng[global thread index].
//
//   rng - device array of generator states indexed by global thread index.
//         There is no bounds check here, so it must hold one element for
//         every thread in the launch.
__global__ void setRng(curandState *rng) {
    const unsigned long long seed = 123456;
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    curandState *slot = rng + tid;

    curand_init(seed, tid, 0, slot);
}

// kernel 2
// Each thread with global index idx < n draws n random points (x, y) from its
// own generator state, counts how many satisfy y^2 <= 1 - x^2 (inside the unit
// quarter circle), and stores the estimate 4 * hits / n in d_a[idx].
//
//   d_a - output, at least n floats; only elements [0, n) are written
//   n   - number of estimates and also the number of samples per estimate
//   rng - generator states, one initialised state per index in [0, n)
//
// Threads whose index is >= n do nothing, so the grid may be larger than n.
// This also makes the kernel a no-op for n <= 0.
__global__ void calPI(float* d_a, int n, curandState *rng) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the last block usually extends past n.
    if (idx >= n) {
        return;
    }

    // Work on a local copy of the generator state and write it back once at
    // the end, instead of updating global memory on every draw.
    curandState state = rng[idx];

    unsigned int hit = 0; // number of points that landed inside the circle

    for (int counter = 0; counter < n; counter++) {
        float x = curand_uniform(&state);
        float y = curand_uniform(&state);

        if (y * y <= 1.0f - x * x) {
            hit++;
        }
    }

    rng[idx] = state;

    // Float literal keeps the arithmetic in single precision.
    d_a[idx] = 4.0f * float(hit) / float(n);
}

// kernel 3
// Per-block sum: block b adds up the elements of d_a that its threads cover
// (global indices b * blockDim.x .. b * blockDim.x + blockDim.x - 1, limited
// to indices below n) and stores the result in d_b[b].
//
//   d_a - input values, at least n floats
//   d_b - output, one float per block in the launch
//   n   - number of valid elements in d_a
//
// Uses a static shared array of ntpb floats indexed by threadIdx.x, so
// blockDim.x must not exceed ntpb.
__global__ void sumPi(float* d_a, float*d_b, const int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int t = threadIdx.x;
    const int width = blockDim.x;

    __shared__ float s[ntpb];

    // Threads past the end of the data contribute 0 instead of reading
    // beyond d_a.
    s[t] = (i < n) ? d_a[i] : 0.0f;

    __syncthreads();

    // Pairwise tree sum in shared memory. The t + stride check keeps the read
    // inside the block's own slots even when blockDim.x is not a power of two.
    for (int stride = 1; stride < width; stride <<= 1) {
        if ((t % (2 * stride) == 0) && (t + stride < width)) {
            s[t] += s[t + stride];
        }

        // Outside the if: every thread in the block must reach the barrier.
        __syncthreads();
    }

    // Thread 0 holds the block total.
    if (t == 0) {
        d_b[blockIdx.x] = s[0];
    }
}

// kernel 4
// Sums the first nblocks elements of c in place: each block adds up the
// elements its threads cover (limited to indices below nblocks) and writes
// the result to c[blockIdx.x]. Launched with a single block that covers all
// nblocks elements, the total ends up in c[0].
//
//   c       - in/out array of partial sums, at least nblocks floats
//   nblocks - number of valid elements in c
//
// Uses a static shared array of ntpb floats indexed by threadIdx.x, so
// blockDim.x must not exceed ntpb.
// NOTE: with more than one block, block b overwrites c[b] while another block
// may still be reading it, and there is no grid-wide synchronisation here, so
// launch this kernel with a single block.
__global__ void accumulate(float* c, const int nblocks) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int t = threadIdx.x;
    const int width = blockDim.x;

    __shared__ float s[ntpb];

    // Stage the inputs in shared memory. Threads past the end of the data
    // contribute 0 instead of reading beyond c.
    s[t] = (i < nblocks) ? c[i] : 0.0f;

    __syncthreads();

    // Pairwise tree sum in shared memory. The t + stride check keeps the read
    // inside the block's own slots even when blockDim.x is not a power of two.
    for (int stride = 1; stride < width; stride <<= 1) {
        if ((t % (2 * stride) == 0) && (t + stride < width)) {
            s[t] += s[t + stride];
        }

        // Outside the if: every thread in the block must reach the barrier.
        __syncthreads();
    }

    // Thread 0 holds the block total.
    if (t == 0) {
        c[blockIdx.x] = s[0];
    }
}

// Prints "<msg> took - <N> millisecs" followed by a newline to standard
// output, where N is span converted to whole milliseconds.
void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    const auto elapsedMs =
        std::chrono::duration_cast<std::chrono::milliseconds>(span).count();

    std::cout << msg << " took - " << elapsedMs << " millisecs" << std::endl;
}

// Prints the CUDA error string for a failed runtime call and exits.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Reports launch-configuration errors for the kernel just launched, then
// waits for it to finish so that execution errors are reported here too.
static void syncAfterLaunch(const char* kernelName) {
    checkCuda(cudaGetLastError(), kernelName);
    checkCuda(cudaDeviceSynchronize(), kernelName);
}

// Estimates PI by Monte Carlo on the CPU and on the GPU and reports the time
// and the averaged result of each.
//
// Usage: <program> scale
//   scale - positive integer n; both versions compute n estimates of n
//           samples each and average them.
//
// Writes the per-trial CPU estimates to h_result.txt and the averaged GPU
// result to d_result.txt. Returns 1 on bad arguments; exits with
// EXIT_FAILURE on a CUDA error.
int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << " scale\n";
        return 1;
    }

    const int n = std::atoi(argv[1]); // scale

    // atoi() yields 0 for non-numeric input; n <= 0 would also lead to a
    // zero-block launch and a division by zero below.
    if (n <= 0) {
        std::cerr << argv[0] << ": scale must be a positive integer\n";
        return 1;
    }

    // Ceiling division written so that it cannot overflow for large n.
    const int nblks = (n - 1) / ntpb + 1;

    // The kernels index their buffers by global thread index, and setRng
    // writes rng[idx] for every launched thread without a bounds check, so
    // the device buffers are sized for the whole grid rather than for n.
    const size_t padded = (size_t)nblks * (size_t)ntpb;

    std::cout << "scale: " << n << std::endl << std::endl;

    std::chrono::steady_clock::time_point ts, te;

    // ---- CPU reference ----
    float* cpu_a = new float[n];

    ts = std::chrono::steady_clock::now();
    calculatePI(n, cpu_a);
    te = std::chrono::steady_clock::now();
    reportTime("CPU", te - ts);

    std::ofstream h_file;
    h_file.open("h_result.txt");

    float cpuSum = 0.0f;
    for (int i = 0; i < n; i++) {
        cpuSum += cpu_a[i];
        h_file << "Host: " << cpu_a[i] << std::endl;
    }
    cpuSum = cpuSum / (float)n;

    std::cout << "CPU Result: " << cpuSum << std::endl;
    h_file.close();
    std::cout << std::endl;

    // ---- GPU version ----
    curandState* d_rng = NULL;
    float* d_a = NULL;
    float* d_b = NULL;

    checkCuda(cudaMalloc((void**)&d_a, padded * sizeof(float)), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void**)&d_b, padded * sizeof(float)), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void**)&d_rng, padded * sizeof(curandState)), "cudaMalloc d_rng");

    // Zero the padding so that no kernel ever reads uninitialised floats.
    checkCuda(cudaMemset(d_a, 0, padded * sizeof(float)), "cudaMemset d_a");
    checkCuda(cudaMemset(d_b, 0, padded * sizeof(float)), "cudaMemset d_b");

    ts = std::chrono::steady_clock::now();

    setRng<<<nblks, ntpb>>>(d_rng);
    syncAfterLaunch("setRng");

    // calculate PI in each thread and store its value in d_a
    calPI<<<nblks, ntpb>>>(d_a, n, d_rng);
    syncAfterLaunch("calPI");

    // Reduce the n estimates to per-block sums, ping-ponging between the two
    // buffers until at most ntpb partial sums remain. One pass is enough
    // whenever nblks <= ntpb.
    float* src = d_a;
    float* dst = d_b;
    int count = n;
    do {
        const int blocks = (count - 1) / ntpb + 1;

        sumPi<<<blocks, ntpb>>>(src, dst, count);
        syncAfterLaunch("sumPi");

        float* tmp = src;
        src = dst;
        dst = tmp;
        count = blocks;
    } while (count > ntpb);

    // Final sum in a single block of fixed size ntpb. A block of nblks
    // threads would overflow the kernel's shared array once nblks > ntpb.
    accumulate<<<1, ntpb>>>(src, count);
    syncAfterLaunch("accumulate");

    te = std::chrono::steady_clock::now();
    reportTime("GPU", te - ts);

    // Only the first element of the reduced buffer holds the total.
    float gpuTotal = 0.0f;
    checkCuda(cudaMemcpy(&gpuTotal, src, sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy result");

    const float gpuSum = gpuTotal / (float)n;

    std::ofstream d_file;
    d_file.open("d_result.txt");
    // The original opened and closed this file without writing anything.
    d_file << "Device: " << gpuSum << std::endl;
    d_file.close();

    std::cout << "GPU Result: " << gpuSum << "\n \n" << std::endl;

    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_rng), "cudaFree d_rng");

    delete[] cpu_a;

    // reset the device
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");

    return 0;
}

</pre>

|}

Xhuang110

57

edits

CDOT Wiki β

Changes

Group 6

CDOT Wiki ^β