Changes

The parallelizing Express

2,919 bytes added, 19:36, 17 March 2017

→‎Assignment 2

=== Description ===

'''Removing CPU Bottleneck'''

Removing the old CPU bottleneck in the ColorTransfer/main.cpp:

}

return u;

}

</pre>

'''Added functions and changes'''

We wrote a matrix-by-vector multiplication function, sgemv, that runs on the GPU through cuBLAS, and made a few minor adjustments to the main loop that performs the color shift on the target (the image to be modified) so that it calls this function.

Matrix by vector

<pre>

void sgemv(const float* h_A, const float* h_B, float* h_C, int n) {

// level 3 calculation: C = alpha * A * B + beta * C

float* devPtrA;

float* devPtrB;

float* devPtrC;

// ... allocate memory on the device

cudaMalloc((void**)&devPtrA, n * n * sizeof(float));

cudaMalloc((void**)&devPtrB, n * sizeof(float));

cudaMalloc((void**)&devPtrC, n * sizeof(float));

// ... create cuBLAS context

cublasHandle_t handle;

cublasStatus_t status;

status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) {

std::cerr << "***cublasCreate failed***\n";

return;

}

// ... copy host matrices to the device

status = cublasSetMatrix(n, n, sizeof(float), h_A, n, devPtrA, n);

if (status != CUBLAS_STATUS_SUCCESS) {

std::cerr << "***cublasSetMatrix A failed***\n";

return;

}

status = cublasSetMatrix(n sizeof(float), h_B, n, devPtrB, n);

if (status != CUBLAS_STATUS_SUCCESS) {

std::cerr << "***cublasSetVector B failed***\n";

return;

}

// ... calculate matrix-vector product

int ld_d_A = n;

int ld_d_B = n;

int ld_d_C = n;

float alpha = 1.0f;

float beta = 0.0f;

status = cublasSgemv(handle, CUBLAS_OP_N, n, n,

&alpha, devPtrA, ld_d_A, devPtrB, ld_d_B, &beta, devPtrC, ld_d_C);

if (status != CUBLAS_STATUS_SUCCESS) {

std::cerr << "***cublasSgemm failed***\n";

return;

}

// ... copy result matrix from the device to the host

status = cublasGetVector(n, sizeof(float), devPtrC, n, h_C, n);

if (status != CUBLAS_STATUS_SUCCESS) {

std::cerr << "***cublasGetVector C failed***\n";

return;

}

// ... destroy cuBLAS context

cublasDestroy(handle);

// ... deallocate device memory

cudaFree(&h_A);

cudaFree(&h_B);

cudaFree(&h_C);

}

</pre>

Changes to main loop

Old

<pre>

// Transform back from lab to RGB

// Visits every pixel of target: multiplies it by mlab2LMS, raises 10 to the
// power of each channel, multiplies the result by mLMS2RGB and stores it back.
// This is an excerpt: the closing brace of the outer (row) loop is not shown.

for(int y=0; y<target.rows; y++) {

for(int x=0; x<target.cols; x++) {

v = target.at<Color3d>(y, x);

v = mlab2LMS * v;

// A channel that is not greater than -5.0 is replaced by eps instead of
// pow(10.0, v(c)).

for(int c=0; c<3; c++) v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;

target.at<Color3d>(y, x) = mLMS2RGB * v;

}

</pre>

New

<pre>

// allocate host memory

float* h_C = new float[3]; // result

// Transform back from lab to RGB

for(int y=0; y<target.rows; y++) {

for(int x=0; x<target.cols; x++) {

v = target.at<Color3d>(y, x);

sgemv(&mlab2LMS, v, h_c);

memcpy(v, h_c, sizeof(Color3d));

for(int c=0; c<3; c++)

v(c) = v(c) > -5.0 ? pow(10.0, v(c)) : eps;

sgemv(&mLMS2RGB , v, h_c);

memcpy(target.at<Color3d>(y, x), h_c, sizeof(Color3d));

}

</pre>

Mradmanovic

94

edits

CDOT Wiki β

Changes

The parallelizing Express

CDOT Wiki ^β