Changes

Kernal Blas

2,544 bytes added, 19:18, 1 April 2018

→‎Assignment 3

=== Assignment 3 ===

The code we used to optimize our code

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <math.h>

#include <time.h>

#include <curand_kernel.h>

#include <cstdlib>

#define BLOCKS 30

#define THREADS 8

#define PI 3.1415926535 // known value of pi

__global__ void gpu_monte_carlo(float *estimate, curandState *states, float n) {

unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

float points_in_circle = 0;

float x, y;

curand_init(1234, tid, 0, &states[tid]); // Initialize CURAND

for (int i = 0; i < n; i++) {

x = curand_uniform(&states[tid]);

y = curand_uniform(&states[tid]);

points_in_circle += (x*x + y*y <= 1.0f); // count if x & y is in the circle.

}

estimate[tid] = 4.0f * points_in_circle / n; // return estimate of pi

}

float host_monte_carlo(long trials) {

float x, y;

long points_in_circle = 0;

for (long i = 0; i < trials; i++) {

x = rand() / (float)RAND_MAX;

y = rand() / (float)RAND_MAX;

points_in_circle += (x*x + y*y <= 1.0f);

}

return 4.0f * points_in_circle / trials;

}

int main(int argc, char** argv) {

clock_t start, stop;

curandState *devStates;

float n = std::atoi(argv[1]);

dim3 dimGrid(BLOCKS, 1, 1); // Grid dimensions

dim3 dimBlock(THREADS, 1, 1); // Block dimensions

float *dev, *host;

size_t size = BLOCKS * THREADS* sizeof(float); //Array memory size

printf("# of trials per thread = %.0f, # of blocks = %d, # of threads/block = %d.\n", n,

BLOCKS, THREADS);

start = clock();

host = (float *)malloc(size); // Allocate array on host

cudaMalloc((void **)&dev, size); // Allocate array on device

cudaMemset(dev, 0, size);

cudaMalloc((void **)&devStates, THREADS * BLOCKS * sizeof(curandState));

gpu_monte_carlo << <BLOCKS, THREADS >> >(dev, devStates, n);

cudaMemcpy(host, dev, size, cudaMemcpyDeviceToHost); // return results

float pi_gpu = 0.0f;

for (int i = 0; i < BLOCKS * THREADS; i++) {

pi_gpu += host[i];

}

pi_gpu /= (BLOCKS * THREADS);

stop = clock();

printf("GPU pi calculated in %f s.\n", (stop - start) / (float)CLOCKS_PER_SEC);

start = clock();

//float pi_cpu = host_monte_carlo(BLOCKS * THREADS * n);

stop = clock();

printf("CPU pi calculated in %f s.\n", (stop - start) / (float)CLOCKS_PER_SEC);

printf("CUDA estimate of PI = %f [error of %f]\n", pi_gpu, pi_gpu - PI);

//printf("CPU estimate of PI = %f [error of %f]\n", pi_cpu, pi_cpu - PI);

return 0;

}

</codeblock>

How we optimize and improved the code is instead of using a randomized number we ask the user for input on pi calculation.

Sasioson

25

edits

CDOT Wiki β

Changes

Kernal Blas

CDOT Wiki ^β