Changes

Jump to: navigation, search

Kernal Blas

211 bytes added, 12:38, 29 March 2018
Assignment 2
'''Main function'''
<syntaxhighlight lang="cpp">
// Main routine that executes on the host
int main(int argc, char** argv) {
// interpret command-line argument
if (argc != 2) {
    std::cerr << argv[0] << ": invalid number of arguments\n";
    return 1;
}
float n = std::atoi(argv[1]);
int nblocks = 30;
steady_clock::time_point ts, te;
dim3 dimGrid(nblocks, 1, 1); // Grid dimensions
dim3 dimBlock(ntpb, 1, 1); // Block dimensions
float *sumHost, *sumDev; // Pointer to host & device arrays
float step = 1.0 / n; // Step size
size_t size = nblocks * ntpb * sizeof(float); //Array memory size
sumHost = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **)&sumDev, size); // Allocate array on device
// Initialize array in device to 0
cudaMemset(sumDev, 0, size);
// initialization
std::srand(std::time(nullptr));
 
ts = steady_clock::now();
 
// Do calculation on device
ts = steady_clock::now();
calculate << <dimGrid, dimBlock >> > (sumDev, n, step, ntpb, nblocks); // call CUDA kernel
// Retrieve result from device and store it in host array
te = steady_clock::now();
reportTime("Time Calculating Pi ", te - ts);
cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);
  for (tid = 0; tid<ntpb*nblocks; tid++)
pi += sumHost[tid];
pi *= step;
// Print results
printf("Number of iterations= %f\nPI = %f\n", n, pi);
reportTime("Pi calculation took ", te - ts);
// Cleanup
96
edits

Navigation menu