Changes

Jump to: navigation, search

Kernal Blas

852 bytes removed, 13:41, 29 March 2018
Assignment 2
<syntaxhighlight lang="cpp">
// Adds, for each thread, the terms 4 / (1 + x*x) evaluated at the bin
// midpoints x = (i + 0.5) * step onto that thread's slot in `sum`.
//
//   sum      - one slot per thread, indexed by blockIdx.x * blockDim.x + threadIdx.x.
//              The kernel adds to the existing value, so the caller must
//              initialise every slot before the launch.
//   nbin     - number of bins; bins with index >= nbin are never evaluated.
//   step     - width of one bin.
//   nthreads,
//   nblocks  - their product is the distance between consecutive bins handled
//              by the same thread (presumably the launch's block size and grid
//              size - confirm against the caller).
//
// All arithmetic is single precision: the literals carry an `f` suffix so the
// loop body is not silently promoted to double. NOTE(review): above 2^24 bins
// the 0.5 midpoint offset falls below float resolution.
__global__ void cal_picalculate(float *sum, int nbin, float step, int nthreads, int nblocks) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x; // Sequential thread index across the blocks

    // 64-bit stride and loop index: `i += stride` cannot overflow near INT_MAX.
    const long long stride = static_cast<long long>(nthreads) * nblocks;

    // Threads with no bin to process leave `sum` untouched (as before); a
    // non-positive stride would never terminate, so bail out instead.
    if (idx >= nbin || stride <= 0) {
        return;
    }

    // Accumulate in a register; start from the stored value so the order of
    // the additions matches a direct `sum[idx] += ...` in the loop.
    float acc = sum[idx];
    for (long long i = idx; i < nbin; i += stride) {
        const float x = (static_cast<float>(i) + 0.5f) * step;
        acc += 4.0f / (1.0f + x * x);
    }
    sum[idx] = acc; // single global-memory write per thread
}
</syntaxhighlight>
<br>
'''Main function'''
<syntaxhighlight lang="cpp">
// Using CUDA device to calculate pi
#include <stdio.h>
#include <cuda.h>
#include <iostream>
#include <ctime>
#include <chrono>
#include <cstdlib>
using namespace std::chrono;
 
//#define NUM_BLOCK 30 // Number of thread blocks
// NOTE(review): 8 threads per block is below the 32-lane warp size, so most
// lanes of each warp would sit idle - confirm this value is intentional.
const int ntpb = 8; // Number of threads per block
// NOTE(review): not referenced in the code visible here; presumably used by main - confirm.
int tid;
// Presumably receives the final estimate of pi; its use is not visible here - confirm.
float pi = 0;
 
// Kernel that executes on the CUDA device.
//
// Adds, for each thread, the terms 4 / (1 + x*x) evaluated at the bin
// midpoints x = (i + 0.5) * step onto that thread's slot in `sum`.
//
//   sum      - one slot per thread, indexed by blockIdx.x * blockDim.x + threadIdx.x.
//              The kernel adds to the existing value, so the caller must
//              initialise every slot before the launch.
//   nbin     - number of bins; bins with index >= nbin are never evaluated.
//   step     - width of one bin.
//   nthreads,
//   nblocks  - their product is the distance between consecutive bins handled
//              by the same thread (presumably the launch's block size and grid
//              size - confirm against the caller).
//
// All arithmetic is single precision: the literals carry an `f` suffix so the
// loop body is not silently promoted to double. NOTE(review): above 2^24 bins
// the 0.5 midpoint offset falls below float resolution.
__global__ void calculate(float *sum, int nbin, float step, int nthreads, int nblocks) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x; // Sequential thread index across the blocks

    // 64-bit stride and loop index: `i += stride` cannot overflow near INT_MAX.
    const long long stride = static_cast<long long>(nthreads) * nblocks;

    // Threads with no bin to process leave `sum` untouched (as before); a
    // non-positive stride would never terminate, so bail out instead.
    if (idx >= nbin || stride <= 0) {
        return;
    }

    // Accumulate in a register; start from the stored value so the order of
    // the additions matches a direct `sum[idx] += ...` in the loop.
    float acc = sum[idx];
    for (long long i = idx; i < nbin; i += stride) {
        const float x = (static_cast<float>(i) + 0.5f) * step;
        acc += 4.0f / (1.0f + x * x);
    }
    sum[idx] = acc; // single global-memory write per thread
}
 
// Prints `msg` followed by the length of `span`, truncated to whole
// milliseconds, on one line of standard output (flushed by std::endl).
void reportTime(const char* msg, steady_clock::duration span) {
    const milliseconds elapsed = duration_cast<milliseconds>(span);
    std::cout << msg;
    std::cout << " - took - " << elapsed.count();
    std::cout << " millisecs" << std::endl;
}
 
 
 
// Main routine that executes on the host
int main(int argc, char** argv) {
96
edits

Navigation menu