Changes

BarraCUDA Boiz

1,347 bytes removed, 02:49, 11 April 2017

→‎Analysis

indices(i, 0) = (int)minidx;

count((int)minidx, 0) += 1;

}

~~__global__ void centersAdd(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<float> samples, cv::cuda::PtrStepSz<int> indices, int N, int dim) {~~

~~int col = blockIdx.x * blockDim.x + threadIdx.x;~~

~~int row = blockIdx.y * blockDim.y + threadIdx.y;~~

~~int i = col + row * N;~~

~~if (i < N) {~~

~~int index = indices(i, 0);~~

~~for (int d = 0; d < dim; d++) {~~

~~centers(index, d) += samples(i, d);~~

}

~~__global__ void centersDivide(cv::cuda::PtrStepSz<float> centers, cv::cuda::PtrStepSz<int> count, int ncluster, int dim) {~~

~~int col = blockIdx.x * blockDim.x + threadIdx.x;~~

~~int row = blockIdx.y * blockDim.y + threadIdx.y;~~

~~int i = col + row * ncluster;~~

~~if (i >= ncluster)~~

~~return;~~

~~for (int d = 0; d<dim; d++) {~~

~~centers(i, d) /= (float)count(i, 0);~~

}

~~__global__ void zeroFloat(cv::cuda::PtrStepSz<float> out) {~~

~~int i = blockIdx.y*blockDim.y + threadIdx.y;~~

~~int j = blockIdx.x*blockDim.x + threadIdx.x;~~

~~if (i >= out.rows || j >= out.cols)~~

~~return;~~

~~out(i, j) = 0.0f;~~

}

~~__global__ void zeroInt(cv::cuda::PtrStepSz<int> out) {~~

~~int i = blockIdx.y*blockDim.y + threadIdx.y;~~

~~int j = blockIdx.x*blockDim.x + threadIdx.x;~~

~~if (i >= out.rows || j >= out.cols)~~

~~return;~~

~~out(i, j) = 0;~~

}

After programming this kernel, we noticed an improvement in performance. ~~Here is a graph comparing the run-times of the serial program vs. the parallelized version.~~ ~~[[File:Assignment2Graph.png]]~~

==== Conclusion ====

Mamichalski

36

edits

Changes

BarraCUDA Boiz

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools