Changes

DPS915/M-N-M

3,069 bytes added, 15:51, 12 April 2013

→‎Assignment 3

[[Image:gpuA3error.png|thumb|widthpx| ]]

==== Final CUDA version ====

<pre>

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

using namespace std;

/**
 * Evaluates a CUDA runtime call exactly once and, if it did not return
 * cudaSuccess, prints the error string with the line and file of the call
 * site to stderr and exits the application with status 1.
 *
 * The body is wrapped in do { ... } while (0) so that the macro followed by a
 * semicolon forms a single statement and can be used safely inside unbraced
 * if/else branches. The argument is parenthesized so that any expression can
 * be passed.
 */
#define CUDA_CHECK_RETURN(value) do { \
    cudaError_t _m_cudaStat = (value); \
    if (_m_cudaStat != cudaSuccess) { \
        fprintf(stderr, "Error %s at line %d in file %s\n", \
                cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
        exit(1); \
    } \
} while (0)

/**
 * Kernel that fills num[0..size) with prime candidates and zeroes the
 * composite ones.
 *
 * Layout of the result: num[0] is 2 and num[k] (k >= 1) is the odd number
 * 2k + 1. After the kernel finishes, every composite entry has been
 * overwritten with 0 and every prime entry still holds its value.
 *
 * Launch requirements: 1D grid of 1D blocks, any sizes. The array is split
 * into gridDim.x contiguous segments and each block initializes and sieves
 * only its own segment, so the block-wide barrier between the two phases is
 * the only synchronization needed and no block ever writes into another
 * block's elements. No shared memory is used.
 *
 * @param num       device array with room for at least size ints
 * @param blockNum  kept for interface compatibility; the kernel reads
 *                  gridDim.x instead, so a mismatching value cannot corrupt
 *                  the result
 * @param threadNum kept for interface compatibility; the kernel reads
 *                  blockDim.x instead
 * @param size      number of candidates to produce; the largest candidate,
 *                  2 * size - 1, must fit in an int
 */
__global__ void prime(int *num, int blockNum, int threadNum, int size) {
    (void)blockNum;
    (void)threadNum;

    // This condition is identical for every thread, so returning here cannot
    // leave part of a block waiting at the barrier below.
    if (num == NULL || size <= 0) {
        return;
    }

    // 64-bit index arithmetic: p * p below exceeds the int range long before
    // size itself does.
    const long long total = size;
    const long long chunk = (total + gridDim.x - 1) / gridDim.x;
    const long long segBegin = (long long)blockIdx.x * chunk;
    long long segEnd = segBegin + chunk;
    if (segEnd > total) {
        segEnd = total;
    }

    /**
     * Phase 1: write the candidates of this block's segment.
     * The first element of the whole array is always 2.
     */
    for (long long k = segBegin + threadIdx.x; k < segEnd; k += blockDim.x) {
        num[k] = (k == 0) ? 2 : (int)(2 * k + 1);
    }

    // Every candidate of the segment must be in place before any thread of
    // this block starts zeroing entries, otherwise a late initialization
    // could overwrite a 0. All threads reach this barrier: the loop above
    // simply runs zero times for threads (or blocks) without work.
    __syncthreads();

    /**
     * Phase 2: zero the composites inside the segment.
     * Sieving number p = 2t + 3 lives at index t + 1; its odd multiples
     * p * (2i + 1) live at indices p * i + (t + 1).
     */
    for (long long t = threadIdx.x; ; t += blockDim.x) {
        const long long p = 2 * t + 3;
        // Smaller multiples of p have a smaller factor and are zeroed while
        // sieving with that factor, so start at p * p.
        const long long first = (p * p - 1) / 2;
        if (first >= segEnd) {
            break;  // first only grows with t: nothing left for this thread
        }

        long long j = first;
        if (j < segBegin) {
            // Jump to the first multiple of p that falls inside the segment.
            j += ((segBegin - first + p - 1) / p) * p;
        }
        for (; j < segEnd; j += p) {
            num[j] = 0;
        }
    }
}

/**
 * Entry point. Reads the candidate count n from the command line, runs the
 * prime kernel over a device array of n ints, copies the array back and
 * prints every non-zero entry on its own line, followed by the elapsed time.
 *
 * Usage: <program> <n>, with n a positive integer.
 * Returns 1 on a wrong argument count or a non-positive n. Any failing CUDA
 * call terminates the program through CUDA_CHECK_RETURN.
 */
int main(int argc, char* argv[]) {
    if (argc != 2) {
        cout << "Incorrect no of arguments" << endl;
        return 1;
    }

    const int n = atoi(argv[1]);
    // atoi returns 0 for non-numeric text, and a zero or negative count
    // would produce meaningless allocation sizes below.
    if (n <= 0) {
        cout << "Number of candidates must be a positive integer" << endl;
        return 1;
    }

    /**
     * variable declarations
     */
    int *device = NULL;
    // Heap-backed buffer: a stack array sized by user input is not standard
    // C++ and overflows the stack for large n.
    std::vector<int> host(n);
    int d = 0;
    cudaDeviceProp prop;

    /**
     * Get the properties of the device in use
     */
    CUDA_CHECK_RETURN(cudaGetDevice(&d));
    CUDA_CHECK_RETURN(cudaGetDeviceProperties(&prop, d));

    const int numberOfBlocks = 8;
    const int maxThreadsPerBlock = prop.maxThreadsPerBlock;
    const int numberOfThreads = maxThreadsPerBlock / numberOfBlocks;

    /**
     * Start timer
     */
    clock_t cb, ce;
    cb = clock();

    /**
     * Allocate memory on the device
     */
    CUDA_CHECK_RETURN(cudaMalloc((void**) &device, sizeof(int) * n));

    /**
     * Call kernel with appropriate grid and thread size
     */
    prime<<<numberOfBlocks, numberOfThreads>>>(device, numberOfBlocks, numberOfThreads, n);
    // A kernel launch returns nothing itself; an invalid launch
    // configuration is only reported through cudaGetLastError().
    CUDA_CHECK_RETURN(cudaGetLastError());

    /**
     * Copy results back to host. cudaMemcpy blocks until the kernel has
     * finished, so errors raised while the kernel ran are reported here.
     */
    CUDA_CHECK_RETURN(cudaMemcpy(&host[0], device, sizeof(int) * n, cudaMemcpyDeviceToHost));

    /**
     * Free memory on device
     */
    CUDA_CHECK_RETURN(cudaFree(device));

    /**
     * Output values; entries set to 0 by the kernel are skipped
     */
    for (int i = 0; i < n; i++) {
        if (host[i] != 0) {
            cout << host[i] << endl;
        }
    }

    /**
     * Stop timer. The measured interval covers allocation, kernel, copy
     * and the console output above.
     */
    ce = clock();
    cout << "Prime generation - took " << double(ce - cb)/CLOCKS_PER_SEC << " seconds" << endl;

    return 0;
}

</pre>

Mohamed Baig

1

edit

Changes

DPS915/M-N-M

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools