</syntaxhighlight>
<br>
'''Main function'''
<syntaxhighlight lang="cpp">
// Main routine that executes on the host.
//
// Usage: <program> <number of iterations>
//
// Launches the `calculate` kernel on a 1-D grid of `nblocks` blocks with
// `ntpb` threads each, copies the per-thread partial sums back to the host,
// adds them up, scales the total by the step size and prints the result
// together with the elapsed time of the device computation.
//
// `calculate`, `ntpb`, `pi`, `reportTime` and `steady_clock` are defined
// outside this block; `pi` is accumulated into as-is, so it is presumably
// zero on entry -- TODO confirm against its declaration.
//
// Returns 0 on success, 1 on a bad command line or any allocation/CUDA failure.
int main(int argc, char** argv) {
    // interpret command-line argument
    if (argc != 2) {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        return 1;
    }
    float n = std::atoi(argv[1]);
    // atoi yields 0 for non-numeric input; 0 or a negative count would make
    // the step size infinite or negative, so reject it up front.
    if (n <= 0.0f) {
        std::cerr << argv[0] << ": number of iterations must be a positive integer\n";
        return 1;
    }

    // Reports a failed CUDA runtime call on stderr; true when the call succeeded.
    auto succeeded = [](cudaError_t err, const char* what) -> bool {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << '\n';
            return false;
        }
        return true;
    };

    int nblocks = 30;
    steady_clock::time_point ts, te;
    dim3 dimGrid(nblocks, 1, 1); // Grid dimensions
    dim3 dimBlock(ntpb, 1, 1);   // Block dimensions
    float step = 1.0f / n;       // Step size
    size_t size = nblocks * ntpb * sizeof(float); // Array memory size (one float per thread)

    // Allocate array on host
    float* sumHost = (float*)malloc(size);
    if (sumHost == nullptr) {
        std::cerr << argv[0] << ": host allocation failed\n";
        return 1;
    }

    float* sumDev = nullptr; // Stays null if cudaMalloc fails, so cudaFree below is a no-op
    int status = 1;          // Switched to 0 only after the result has been printed

    // Allocate array on device and initialize it to 0
    if (succeeded(cudaMalloc((void**)&sumDev, size), "cudaMalloc") &&
        succeeded(cudaMemset(sumDev, 0, size), "cudaMemset")) {
        // initialization (the seeded generator is not used within this function)
        std::srand(std::time(nullptr));

        ts = steady_clock::now();
        // Do calculation on device
        calculate<<<dimGrid, dimBlock>>>(sumDev, n, step, ntpb, nblocks); // call CUDA kernel
        // A launch returns before the kernel has finished: first pick up any
        // launch-configuration error, then wait for completion so that the
        // end timestamp covers the computation and not just the launch.
        bool kernelOk = succeeded(cudaGetLastError(), "kernel launch") &&
                        succeeded(cudaDeviceSynchronize(), "kernel execution");
        te = steady_clock::now();

        if (kernelOk &&
            succeeded(cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost), "cudaMemcpy")) {
            // Combine the per-thread partial sums on the host
            for (int i = 0; i < ntpb * nblocks; i++)
                pi += sumHost[i];
            pi *= step;

            // Print results
            printf("Number of iterations= %f\nPI = %f\n", n, pi);
            reportTime("Pi calculation took ", te - ts);
            status = 0;
        }
    }

    // Cleanup (runs on the failure paths as well)
    free(sumHost);
    cudaFree(sumDev);
    return status;
}
</syntaxhighlight>
'''Results CPU vs GPU'''
<br>