94
edits
Changes
→The parallelizing Express
== Assignment 3 ==
=== Optimized Kernel ===
<pre>
/*
 * matvec_kernel
 *
 * In-place per-pixel transform of d_Tar. For every pixel p (3 floats):
 *     t    = matvec(p, d_RGB2)
 *     t[c] = t[c] > -5 ? 10^t[c] : 1e-4        (c = 0..2)
 *     p    = matvec(t, d_LMS2)
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread starts at row
 * threadIdx.x + blockIdx.x * blockDim.x and advances by blockDim.x * gridDim.x
 * until targetrows is reached, walking all targetcols pixels of each row.
 * No shared memory is used.
 *
 * Parameters:
 *   d_A, d_C    - unused; kept so existing launch sites still compile. They
 *                 used to serve as scratch vectors shared by every thread,
 *                 which raced; the scratch now lives in per-thread locals.
 *   d_RGB2      - matrix passed to the first matvec call.
 *   d_LMS2      - matrix passed to the second matvec call.
 *   n           - unused.
 *   targetrows  - number of rows to process.
 *   targetcols  - number of pixels per row.
 *   d_Tar       - image data, read and overwritten in place.
 *
 * NOTE(review): matvec is defined outside this block; it is called here as
 * matvec(input_vec, matrix, output_vec), matching how the previous code used
 * its third argument as the result.
 */
__global__ void matvec_kernel(float* d_A, float* d_RGB2, float* d_LMS2, float* d_C,
                              const int n, int targetrows, int targetcols, float* d_Tar)
{
    // Silence unused-parameter warnings; see header comment.
    (void)d_A;
    (void)d_C;
    (void)n;

    // Float literal: a double constant would promote the math to FP64.
    const float eps = 1.0e-4f;

    const int stride = blockDim.x * gridDim.x;

    for (int tid = threadIdx.x + blockIdx.x * blockDim.x;
         tid < targetrows;
         tid += stride)
    {
        for (int x = 0; x < targetcols; ++x) {
            // NOTE(review): assumes d_Tar is a packed row-major image with
            // 3 interleaved floats per pixel - TODO confirm against the host
            // allocation. The previous offset (tid * 3 + x) made neighbouring
            // threads/iterations overlap on the same floats; for
            // targetcols == 1 both formulas give the same address.
            const size_t base = ((size_t)tid * (size_t)targetcols + (size_t)x) * 3;
            float* pixel = d_Tar + base;

            // Per-thread scratch. The old code did memcpy(&d_A, ...), which
            // overwrote the pointer variable itself (12 bytes into an 8-byte
            // pointer) instead of the buffer it pointed to.
            float vec[3];
            float res[3];

            for (int c = 0; c < 3; ++c)
                vec[c] = pixel[c];

            matvec(vec, d_RGB2, res);

            // Undo the log10 encoding; clamp very small values to eps.
            for (int c = 0; c < 3; ++c)
                vec[c] = res[c] > -5.0f ? powf(10.0f, res[c]) : eps;

            matvec(vec, d_LMS2, res);

            for (int c = 0; c < 3; ++c)
                pixel[c] = res[c];
        }
    }
}
</pre>
=== Results ===
[[File:a3timings.PNG]]