Furthermore, we found that if the sample number n is less than 1024, the entire grid fits in a single block (1024 being the maximum number of threads per block), so the kernel can stage its data in shared memory.
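A minimal sketch of what such a shared-memory kernel could look like is shown below, assuming the whole grid fits in one block. The kernel name <code>kernelShared</code> and the explicit finite-difference update in its body are illustrative stand-ins, not taken from the assignment code.

<syntaxhighlight lang="c++">
// Sketch only: single-block shared-memory variant of the heat-equation kernel.
// Assumes xGridNum <= 1024 so the whole grid fits in one block, launched as
//   kernelShared<<<1, xGridNum>>>(d_solutionA, d_solutionB, xGridNum, dt, dx, K);
__global__ void kernelShared(double* solutionNew, const double* solutionLast,
                             int xGridNum, double dt, double dx, double K) {
    __shared__ double s[1024];           // one element per thread
    int i = threadIdx.x;
    if (i < xGridNum)
        s[i] = solutionLast[i];          // one global read per grid point
    __syncthreads();                     // whole grid staged before any neighbour reads

    if (i > 0 && i < xGridNum - 1) {
        // standard explicit finite-difference update, reading neighbours
        // from shared memory instead of global memory
        solutionNew[i] = s[i] + K * dt / (dx * dx)
                       * (s[i + 1] - 2.0 * s[i] + s[i - 1]);
    }
}
</syntaxhighlight>

Each grid point is read from global memory once per step instead of three times (once by itself and once by each neighbour); the repeated neighbour reads are served from shared memory.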
==== Reduce Memory Copy Function Calls ====
===== Assignment 2 Source Code =====
<syntaxhighlight lang="c++">
while (j < tTotal / dt) {
    solutionNew[0] = 10;                 // reset boundary conditions on the host
    solutionNew[xGridNum] = 120;
    // three host-to-device copies on every time step
    cudaMemcpy(d_x, x, (xGridNum + 1) * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_solutionLast, solutionLast, xGridNum * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_solutionNew, solutionNew, xGridNum * sizeof(double), cudaMemcpyHostToDevice);
    kernel<<<(xGridNum + ntpb - 1) / ntpb, ntpb>>>(d_solutionNew, d_solutionLast, d_x, xGridNum, dt, dx, K, j);
    j++;
    // ...and three device-to-host copies to bring the results back
    cudaMemcpy(x, d_x, (xGridNum + 1) * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(solutionLast, d_solutionLast, xGridNum * sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(solutionNew, d_solutionNew, xGridNum * sizeof(double), cudaMemcpyDeviceToHost);
    solutionLast = solutionNew;
}
</syntaxhighlight>
===== Improved Source Code =====
<syntaxhighlight lang="c++">
while (j < tTotal / dt) {
    // both solution buffers stay resident on the device; no per-step uploads
    kernel<<<(xGridNum + ntpb - 1) / ntpb, ntpb>>>(d_solutionA, d_solutionB, d_x, xGridNum, dt, dx, K, j);
    j++;
    // single copy back, only for the current result
    cudaMemcpy(solutionNew, d_solutionA, xGridNum * sizeof(double), cudaMemcpyDeviceToHost);
    /*
    myfile << "Time" << tTotal / dt << std::endl;
    for (int i = 0; i <= xGridNum; i++) {
        myfile << solutionNew[i] << ":";
    }
    */
    // swap the device pointers so the new solution becomes the next step's input
    d_solutionTemp = d_solutionA;
    d_solutionA = d_solutionB;
    d_solutionB = d_solutionTemp;
}
</syntaxhighlight>
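Keeping both solution buffers on the device reduces the six <code>cudaMemcpy</code> calls per time step to a single device-to-host copy, and the end-of-step swap exchanges only the <code>d_solutionA</code>/<code>d_solutionB</code> pointers, so no array data moves at all. If intermediate results are not needed on every step, that remaining copy could also be hoisted out of the loop, at the cost of the host only seeing the final solution.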