BarraCUDA Boiz
== Progress ==
==== Problem ====
After surveying the original code, we found three major hot-spots of heavy CPU usage.

This block of code reshapes the input pixels into a set of samples for classification.
const int N = width * height;
const int dim = img.channels();
cv::Mat samples = cv::Mat(N, dim, CV_32FC1);
for (int x = 0; x < width; x++) {
    for (int y = 0; y < height; y++) {
        for (int d = 0; d < dim; d++) {
            int index = y * width + x;
            samples.at<float>(index, d) = (float)img.at<uchar>(y, x * dim + d);
        }
    }
}

[[File:SetSamplesSerial.png]]

This block of code computes the distances between the already sampled centers and the other input samples.

[[File:CalculateDistanceSerial.png|550px]]

This block of code generates the image that has to be outputted.

[[File:GenerateImageSerial.png|550px]]
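The screenshots above carry the exact serial code; as a rough idea of what the distance hot-spot does, here is a minimal sketch reconstructed from the parallel kernel shown later (variable names such as minval and D are assumptions):

// Serial distance pass: for the most recently chosen center (k - 1),
// keep each sample's nearest-center squared distance and accumulate the total.
for (int i = 0; i < N; i++) {
    double dist = 0.0;
    for (int d = 0; d < dim; d++) {
        double diff = centers.at<float>(k - 1, d) - samples.at<float>(i, d);
        dist += diff * diff;
    }
    if (dist < minval[i])
        minval[i] = dist;
    D += minval[i];
}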
==== Analysis ====
You can find the new parallelized KmeansPlusPlus code here: [https://github.com/MajinBui/KmeansPlusPlusCuda kmeansplusplusCUDA]. Here are the kernels that we programmed.
Here is the Set Samples kernel that we programmed.

[[File:SetSamplesKernel.png|550px]]

Here is the setCenter kernel, which seeds a center from a randomly chosen sample:

__global__ void setCenter(float* d_center, float* d_sample, int n, int dim, int randi) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    // Copy the randomly selected sample (randi) into the set of centers.
    if (i < n && j < n)
        d_center[j * n + i] = d_sample[j * randi + i];
}
Launching the kernel:

int nb = (n + ntpb - 1) / ntpb;
dim3 dGrid(nb, nb, 1);
dim3 dBlock(ntpb, ntpb, 1);
float* d_center = nullptr;
cudaMalloc((void**)&d_center, centers.rows * centers.cols * sizeof(float));
cudaMemcpy(d_center, (float*)centers.data, centers.rows * centers.cols * sizeof(float), cudaMemcpyHostToDevice);
check(cudaGetLastError());
float* d_sample = nullptr;
cudaMalloc((void**)&d_sample, samples.rows * samples.cols * sizeof(float));
cudaMemcpy(d_sample, (float*)samples.data, samples.rows * samples.cols * sizeof(float), cudaMemcpyHostToDevice);
int rand = genrand_int31() % n;
setCenter<<<dGrid, dBlock>>>(d_center, d_sample, N, dim, rand);
cudaDeviceSynchronize();
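The launch above leaves the chosen center on the device; a minimal sketch of the copy-back and cleanup that would follow (assuming the host-side centers matrix is the destination):

// Copy the updated centers back to the host and release the device buffers.
cudaMemcpy((float*)centers.data, d_center, centers.rows * centers.cols * sizeof(float), cudaMemcpyDeviceToHost);
check(cudaGetLastError());
cudaFree(d_center);
cudaFree(d_sample);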
Calculate Distance kernel

[[File:CalculateDistanceKernel.png|550px]]
Generate Image kernel

[[File:GenerateImageKernel.png|550px]]

The kernels:

__global__ void setSamples(cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<uchar> img, int dimC) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= img.rows || j >= img.cols)
        return;
    // One thread per pixel: flatten (i, j) into a sample row and copy each channel.
    int index = i * img.cols + j;
    for (int d = 0; d < dimC; d++) {
        samples(index, d) = (float)img(i, j * dimC + d);
    }
}
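setSamples is indexed per pixel, so its grid should cover the whole image. A sketch of one way to launch it, assuming d_samples and d_img are cv::cuda::GpuMat buffers already resident on the device:

// Derive the grid from the image dimensions rather than a fixed count.
dim3 dBlock(ntpb, ntpb, 1);
dim3 dGrid((img.cols + ntpb - 1) / ntpb, (img.rows + ntpb - 1) / ntpb, 1);
setSamples<<<dGrid, dBlock>>>(d_samples, d_img, dim);
cudaDeviceSynchronize();
check(cudaGetLastError());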
__global__ void calculateDistance(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, int k, int N, int dim, double* minval, float* D) {
    // Compute distances between the most recently sampled center and the input samples.
    // Update each sample's nearest distance if the new one is smaller.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int i = col + row * N;
    if (i >= N)
        return;
    double dist = 0.0;
    for (int d = 0; d < dim; d++) {
        double diff = centers(k - 1, d) - samples(i, d);
        dist += diff * diff;
    }
    if (dist < minval[i]) {
        minval[i] = dist;
    }
    // Every thread adds into the single accumulator, so the add must be atomic.
    atomicAdd(D, (float)minval[i]);
}
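This kernel runs once per new center during k-means++ seeding. The selection of the next center is not shown on this page, so the following is only a sketch of the textbook roulette-wheel step that would wrap the kernel (the buffer names d_D, d_minval, h_minval and the grid variables are assumptions):

// k-means++ seeding: after each distance pass, draw the next center
// with probability proportional to its nearest-center squared distance.
for (int k = 1; k < ncluster; k++) {
    float zero = 0.0f;
    cudaMemcpy(d_D, &zero, sizeof(float), cudaMemcpyHostToDevice);
    calculateDistance<<<dGrid, dBlock>>>(d_centers, d_samples, k, N, dim, d_minval, d_D);
    cudaDeviceSynchronize();
    float Dsum = 0.0f;
    cudaMemcpy(&Dsum, d_D, sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_minval, d_minval, N * sizeof(double), cudaMemcpyDeviceToHost);
    // Roulette-wheel selection: draw r in [0, Dsum) and walk the prefix sum.
    double r = ((double)genrand_int31() / 2147483648.0) * Dsum;
    double acc = 0.0;
    int next = 0;
    for (int i = 0; i < N; i++) {
        acc += h_minval[i];
        if (acc >= r) { next = i; break; }
    }
    // Copy sample 'next' into row k of the centers (device-to-device copy).
}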
__global__ void sampleClassification(cv::cuda::PtrStepSz<int> indices, cv::cuda::PtrStepSz<int> count, cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, int N, int ncluster, int dim) {
    // Sample classification: assign each sample to its nearest center.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int i = col + row * N;
    if (i >= N)
        return;
    int minidx = 0;
    double minval = 1000000000;
    for (int k = 0; k < ncluster; k++) {
        double dist = 0.0;
        for (int d = 0; d < dim; d++) {
            float diff = centers(k, d) - samples(i, d);
            dist += diff * diff;
        }
        if (minval > dist) {
            minval = dist;
            minidx = k;
        }
    }
    indices(i, 0) = minidx;
    // Many threads can land on the same cluster at once, so the tally must be atomic.
    atomicAdd(&count(minidx, 0), 1);
}
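Because the kernel flattens its thread coordinates as i = col + row * N, a plain 1D launch (gridDim.y == 1) makes the index arithmetic come out right. A sketch, with the device buffers assumed to be cv::cuda::GpuMat wrappers:

// With a 1D launch, row is always 0, so i is simply the global x index.
int nb = (N + ntpb - 1) / ntpb;
sampleClassification<<<dim3(nb, 1, 1), dim3(ntpb, 1, 1)>>>(d_indices, d_count, d_centers, d_samples, N, ncluster, dim);
cudaDeviceSynchronize();
check(cudaGetLastError());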
__global__ void centersAdd(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<int> indices, int N, int dim) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int i = col + row * N;
    if (i < N) {
        int index = indices(i, 0);
        for (int d = 0; d < dim; d++) {
            // Samples assigned to the same center are summed concurrently,
            // so the accumulation must be atomic.
            atomicAdd(&centers(index, d), samples(i, d));
        }
    }
}
__global__ void centersDivide(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<int> count, int ncluster, int dim) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int i = col + row * ncluster;
    if (i >= ncluster)
        return;
    // Divide each center's accumulated sum by its cluster population to get the mean.
    for (int d = 0; d < dim; d++) {
        centers(i, d) /= (float)count(i, 0);
    }
}
__global__ void zeroFloat(cv::cuda::PtrStepSz<float> out) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= out.rows || j >= out.cols)
        return;
    out(i, j) = 0.0f;
}
__global__ void zeroInt(cv::cuda::PtrStepSz<int> out) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= out.rows || j >= out.cols)
        return;
    out(i, j) = 0;
}
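Taken together, these kernels implement one Lloyd iteration. A hedged sketch of the host-side sequence (the grid/block variables are assumptions, sized as described for each kernel):

// One k-means iteration: classify against the current centers,
// then rebuild the centers as per-cluster means.
zeroInt<<<dGridCount, dBlock>>>(d_count);
sampleClassification<<<dGridSamples, dBlock>>>(d_indices, d_count, d_centers, d_samples, N, ncluster, dim);
zeroFloat<<<dGridCenters, dBlock>>>(d_centers);
centersAdd<<<dGridSamples, dBlock>>>(d_centers, d_samples, d_indices, N, dim);
centersDivide<<<dGridCenters, dBlock>>>(d_centers, d_count, ncluster, dim);
cudaDeviceSynchronize();
check(cudaGetLastError());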
__global__ void generateImage(cv::cuda::PtrStepSz<uchar> out, cv::cuda::PtrStepSz<int> indices, cv::cuda::PtrStepSz<float> centers, int dim) {
    // Generate the output image: paint each pixel with its assigned center's color.
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= out.rows || j >= out.cols)
        return;
    int index = i * out.cols + j;
    int ci = indices(index, 0);
    for (int d = 0; d < dim; d++) {
        out(i, j * dim + d) = (uchar)centers(ci, d);
    }
}

==== Conclusion ====

By comparing the run-times of the serial KmeansPlusPlus and the parallelized version, we can see that the performance of the program has improved. The improvement is not significant for smaller cluster and iteration counts, but performance is clearly better for the larger test cases. Here is a graph comparing the run-times of the serial program vs. the parallelized version.

[[File:GraphAssignment2.png|900px]]

=== Assignment 3 ===
For assignment 3, we optimized the kernels by allocating the correct amount of grid and block resources for each kernel. Previously, we allocated a 32x32 thread block for every kernel call even when the kernel did not require it. After the adjustments, we found significant improvements for many of the kernels.
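As an illustration of that change, the launch configuration can be derived from each kernel's actual work size instead of a fixed 32x32 launch. A sketch (the block size and variable names are assumptions):

// Per-pixel kernels: one thread per pixel.
dim3 dBlock(32, 32, 1);
dim3 dGridImage((img.cols + dBlock.x - 1) / dBlock.x,
                (img.rows + dBlock.y - 1) / dBlock.y, 1);
// Per-center kernels: only ncluster threads are needed.
dim3 dGridCenters((ncluster + dBlock.x - 1) / dBlock.x, 1, 1);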
==== Runtime of program ====
Here, we see that the program was improved by the optimization of threads per block.

For larger images, we found that the program improved more and more as the number of clusters and iterations increased.

[[File:Big Image.png]]

For medium images, we found more inconsistent results.

[[File:Med Image.png]]

For small images, we found the most inconsistent results after the optimizations.

[[File:Small Image.png]]

In general, the larger the image, the more efficient the kernels became.

==== Runtime of each kernel ====

Each kernel individually found significant or marginal improvements after adjusting the thread/block size.

Set Samples found small improvements on average.

[[File:Set Samples.png]]

Here we moved the calculation of y_index outside of the inner loop (see the sketch at the end of this subsection).

[[File:SetSamplesKernelOptimized.png|550px]]

Calculate Distance found significant improvements.

[[File:Calculate Distance Kernel.png]]

The biggest change was the thread/block size.

[[File:CalculateDistanceKernelOptimized.png|550px]]

Generate Image found improvements as well, since image sizes varied. Changing the thread/block size to match the actual number of pixels enabled better usage of memory.

[[File:Generate Image Kernel.png]]

The biggest change was the thread/block size.

[[File:GenerateImageKernelOptimized.png|550px]]
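The Set Samples change referenced above amounts to hoisting loop-invariant index math out of the channel loop. A sketch of the before/after, reconstructed from the setSamples kernel shown earlier (the y_index name is taken from the description):

// Before: the sample row index is recomputed on every channel iteration.
for (int d = 0; d < dimC; d++) {
    samples(i * img.cols + j, d) = (float)img(i, j * dimC + d);
}
// After: compute the row index once, outside the inner loop.
int y_index = i * img.cols + j;
for (int d = 0; d < dimC; d++) {
    samples(y_index, d) = (float)img(i, j * dimC + d);
}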
==== Conclusion ====

By comparing the run-times of the serial KmeansPlusPlus and the parallelized version, we can see that the performance of the program has improved slightly. This program could be further improved by off-loading more operations from the CPU to the GPU, but that would require more time and research.