94
edits
Changes
→Assignment 2
'''Removing CPU Bottleneck'''
<pre>
''' Added functions and changes'''
To fix this issue, we wrote a device function which handles matrix-by-vector multiplication, and made a few minor adjustments to the main loop that deals with the target (the image to be modified). We also wrote a kernel which has the same logic as the host version, so that we could offload all the needed data to the kernel once without multiple back-and-forth transfers. Finally, we wrote a helper function which allocates and transfers all the color-shift OpenCV matrices into a form suitable for the kernel to deal with.
Matrix by vector
<pre>
__device__ void sgemvmatvec(const float* h_Ad_A, const float* h_Bd_B, float* h_C, int nd_C) { // level 2 calculation: C float sum = alpha * A * x + B * y float* devPtrA0; float* devPtrB; float* devPtrC; // ... allocate memory on the device cudaMallocfor ((void**)&devPtrA, n * n * sizeof(float))int i = 0; cudaMalloc((void**)&devPtrB, i < n * sizeof(float)); cudaMalloc((void**)&devPtrC, n * sizeof(float)); // ... create cuBLAS context cublasHandle_t handle; cublasStatus_t status; status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS++i) { std::cerr << "sum += d_A[i] *d_B[(i **cublasCreate failed***\n"; return) + tid];
}
d_C[0] = sum;}<// ... copy host matrices to the devicepre> Kernel<pre> status = cublasSetMatrix__global__ void matvec_kernel(nfloat* d_A, nfloat* d_RGB2, sizeof(float)* d_LMS2, h_Afloat* d_C, const int n, devPtrAint targetrows, nint targetcols, float* d_Tar){ const double eps = 1.0e-4; if for (status !int y = CUBLAS_STATUS_SUCCESS0; y < targetrows; ++y) { std::cerr for (int x = 0; x << "targetcols; ++x) { memcpy(&d_A, h_Tar[y *3 + x], N *sizeof(float)); matvec(&d_A, &d_RGB2, d_C); memcpy(&d_A, h_C, N *cublasSetMatrix A failedsizeof(float)); for (int c = 0; c < 3; c++) d_A[c] = d_A[c] > -5.0 ? pow((double)10.0, (double)d_A[c]) : eps; matvec(&d_A, &d_LMS2, d_C); memcpy(&h_Tar[y *3 + x], d_C, N **\n"sizeof(float)); return;}
}
}</pre> Helper<pre> inline void vecTransfer(float* h, Color3d* v){ status for (int j = cublasSetMatrix(n sizeof0; j < 3; ++j) h[j] = v->v[j];} //KERNEL Helper function does setup and launchvoid matvec_L(cv::Mat* mRGB2LMS, cv::Mat* mLMS2lab, float* h_C, int tarrow, int tarcol, float* h_Tar){ float *h_A, h_B*h_RGB2, *h_LMS2, *h_C; float *d_A, n*d_RGB2, devPtrB*d_LMS2, n*d_C; int N = 3; h_A = (float*)malloc(sizeof(float) * N); h_RGB2 = new float[mRGB2LMS->total()]; if h_LMS2 = new float[mLMS2LMS->total(status !)]; h_C = CUBLAS_STATUS_SUCCESS(float*)malloc(sizeof(float) {* N); cudaMalloc((void**)&d_A, sizeof(float) * N); std::cerr << " cudaMalloc((void**)&d_RGB2, sizeof(float) *N *N); cudaMalloc((void*cublasSetVector B failed*)&d_LMS2, sizeof(float) *N *\n"N); return cudaMalloc((void**)&d_C, sizeof(float) * N); }Color3d vec; // ... calculate copy vec and matrixto host pointers vecTransfer(h_A, vec); memcpy(h_RGB2, mRGB2LMS->data, mRGB2LMS-vector product>total()); int ld_d_A = memcpy(h_LMS2, mLMS2Lab->data, mLMS2Lab->total()); cudaMemcpy(d_A, h_A, sizeof(float) * N, cudaMemcpyHostToDevice); cudaMemcpy(d_RGB2, h_RGB2, sizeof(float) * N * N, cudaMemcpyHostToDevice); cudaMemcpy(d_LMS2, h_LMS2, sizeof(float) * N * N, cudaMemcpyHostToDevice); matvec_kernel<<<N / BLOCK_SIZE + 1, BLOCK_SIZE>>>(d_A, d_RGB2, d_LMS2, d_C, N); //printf("error code: %s\n",cudaGetErrorString(cudaGetLastError())); cudaMemcpy(h_C, d_C, sizeof(float) * N, cudaMemcpyDeviceToHost); free(h_A); free(h_RGB2); int ld_d_B = nfree(h_LMS2); int ld_d_C = nfree(h_C);
}
</pre>
Changes to main loop
Old
<pre>
}
</pre>
New
<pre>
</pre>