Changes

Kernal Blas

211 bytes added, 12:38, 29 March 2018

→‎Assignment 2

'''Main function'''

// Main routine that executes on the host
int main(~~void~~int argc, char** argv) {
// interpret command-line argument
if (argc != 2) {
std::cerr << argv[0] << ": invalid number of arguments\n";
return 1;
}
float n = std::atoi(argv[1]);
int nblocks = 30;

steady_clock::time_point ts, te;

dim3 dimGrid(~~NUM_BLOCK~~nblocks, 1, 1); // Grid dimensions dim3 dimBlock(~~NUM_THREAD~~ntpb, 1, 1); // Block dimensions

float *sumHost, *sumDev; // Pointer to host & device arrays

float step = 1.0 / ~~NBIN~~n; // Step size size_t size = ~~NUM_BLOCK~~nblocks*~~NUM_THREAD~~ ntpb * sizeof(float); //Array memory size

sumHost = (float *)malloc(size); // Allocate array on host

cudaMalloc((void **)&sumDev, size); // Allocate array on device

// Initialize array in device to 0

cudaMemset(sumDev, 0, size);

// initialization

std::srand(std::time(nullptr));

ts = steady_clock::now();

// Do calculation on device

~~ts = steady_clock::now();~~ ~~cal_pi~~ calculate << <dimGrid, dimBlock >> > (sumDev, ~~NBIN~~n, step, ~~NUM_THREAD~~ntpb, ~~NUM_BLOCK~~nblocks); // call CUDA kernel ~~// Retrieve result from device and store it in host array~~

te = steady_clock::now();

~~reportTime("Time Calculating Pi ", te - ts);~~

cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);

for (tid = 0; tid<~~NUM_THREAD~~ntpb*~~NUM_BLOCK~~nblocks; tid++)

pi += sumHost[tid];

pi *= step;

// Print results

printf("PI Number of iterations= %f\nPI = %f\n", n, pi); reportTime("Pi calculation took ", te - ts);

// Cleanup

Jpham14

96

edits

Changes

Kernal Blas

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools