25
edits
Changes
→Assignment 2
=== Assignment 2 ===
Offloading the computation to the GPU reduces the time needed to calculate pi.
<br>
[[File:Pi_calculation.png]]
<br>
'''Kernel code used'''
<br>
<syntaxhighlight lang="cpp">
// Kernel: midpoint-rule integration of 4/(1+x^2) over [0,1).
// Each thread accumulates a partial sum in sum[idx] using a grid-stride
// loop over the nbin bins, so any launch configuration covers all bins.
// Preconditions: sum points to at least nthreads*nblocks floats and is
// zero-initialized; nthreads/nblocks match blockDim.x/gridDim.x of the launch.
__global__ void cal_pi(float *sum, int nbin, float step, int nthreads, int nblocks) {
int i;
float x;
int idx = blockIdx.x*blockDim.x + threadIdx.x; // Sequential thread index across the blocks
for (i = idx; i< nbin; i += nthreads*nblocks) {
x = (i + 0.5f)*step;                 // bin midpoint; 0.5f keeps the math in float
sum[idx] += 4.0f / (1.0f + x*x);     // float literals avoid silent double promotion
}
}
</syntaxhighlight>
<br>
'''Main function'''
<syntaxhighlight lang="cpp">
// Host driver: allocates per-thread partial sums, launches cal_pi,
// times the kernel, and finishes the reduction on the host.
int main(void) {
steady_clock::time_point ts, te;
dim3 dimGrid(NUM_BLOCK, 1, 1);   // Grid dimensions
dim3 dimBlock(NUM_THREAD, 1, 1); // Block dimensions
float *sumHost, *sumDev;         // Pointer to host & device partial-sum arrays
float step = 1.0f / NBIN;        // Integration step size
size_t size = NUM_BLOCK*NUM_THREAD * sizeof(float); // Array memory size
sumHost = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **)&sumDev, size); // Allocate array on device
// Kernel accumulates with +=, so the device array must start zeroed
cudaMemset(sumDev, 0, size);
// Time the kernel. Kernel launches are asynchronous, so we must
// synchronize before reading the clock — otherwise only the launch
// overhead would be measured, not the actual computation.
ts = steady_clock::now();
cal_pi << <dimGrid, dimBlock >> > (sumDev, NBIN, step, NUM_THREAD, NUM_BLOCK); // call CUDA kernel
cudaDeviceSynchronize();         // wait for the kernel to finish before timing
te = steady_clock::now();
reportTime("Time Calculating Pi ", te - ts);
// Retrieve partial sums from device and reduce them on the host.
cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost);
double pi = 0.0;                 // accumulate in double for a more accurate sum
for (int tid = 0; tid < NUM_THREAD*NUM_BLOCK; tid++)
pi += sumHost[tid];
pi *= step;
// Print results
printf("PI = %f\n", pi);
// Cleanup
free(sumHost);
cudaFree(sumDev);
return 0;
}
</syntaxhighlight>
=== Assignment 3 ===