Difference between revisions of "Carlos"

From CDOT Wiki
Jump to: navigation, search
(Assignment 2)
(Progress)
 
(3 intermediate revisions by the same user not shown)
Line 121: Line 121:
  
  
---- Profiling Results for the summarizedAreaTable() function ------
+
---- Profiling Results for the summedAreaTable() function ------
  
  
Line 145: Line 145:
  
 
=== Assignment 2 ===
 
=== Assignment 2 ===
 +
 +
Here is my code to parallelize the SAT algorithm I worked on A1:
  
 
<pre>
 
<pre>
Line 441: Line 443:
 
     300                  3.08                        0.2124
 
     300                  3.08                        0.2124
 
     400                  9.66                        0.6549
 
     400                  9.66                        0.6549
 
 
     500                  24.17                        1.58
 
     500                  24.17                        1.58
 
 
     600                  54.4                        3.268
 
     600                  54.4                        3.268
 
 
     700                113.17                        5.976
 
     700                113.17                        5.976
  
Line 451: Line 450:
  
 
</pre>
 
</pre>
 +
 +
=== Assignment 3 ===
 +
 +
Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.

Latest revision as of 14:59, 15 April 2013

Team Carlos

Team Members

  1. Carlos Conejo

Progress

Assignment 1

For my assignment 1, I profiled a Summarized Area Table code. Here is the code:


#include <iostream>

#include <cstdlib>



using namespace std;



/* Creates the Matrice */

void createMatrice(float** a, int size){

	for(int i = 0; i < size; i++)

		a[i] = new float[size];

}



/* Initializes the matrice to any random number between 1 and 9 */

void initializeMatrice(float** a, int size){

	float f = 1.0 / RAND_MAX;

	for(int i = 0; i < size; i++)

		for(int j = 0; j < size; j++)

			a[i][j] = rand() * f;

}



/* Creates the summarized area table */

void summarizedAreaTable(float** a, float** b, int size){

	int k = 0;

	float sum = 0.0;

	for(int i = size-1; i >= 0; i--){

		for(int j = 0; j < size; j++){

			for(int k = i; k < size; k++){

				for(int m = 0; m <= j; m++){

					sum += a[k][m];

				}

			}

			b[i][j] = sum;

			sum = 0.0;

		}

	}

}



int main(int argc, char* argv[]){

	if(argc == 2){ // only one argument (program name + one argument) allowed

		int size = atoi(argv[1]);

		float **a = new float*[size];

		float **b = new float*[size];



		createMatrice(a,size); // creates the matrice a

		createMatrice(b,size); // creates the matrice b

		initializeMatrice(a,size); // initializes the matrices

		summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b



		cout << "Finished" << endl;

		return 0;

	}

	else if(argc < 2)

		cout << "Please provide a size" << endl; // when no arguments are supplied

	else

		cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied

}


---- Profiling Results for the summedAreaTable() function ------



		Word Problem                          Seconds      

	            250                                  1.50

	            500                                 25.87

		    750                                173.99 

		   1000	                               658.34

   

--------------------------------------------------------------------




Assignment 2

Here is my code to parallelize the SAT algorithm I worked on A1:


#include <iostream>

#include <cstdlib>

#include <cuda_runtime.h>



using namespace std;



/* Initializes the matrix to any random number between 0 and 1 */

void initializeMatrix(float* a, int size){

	float f = 1.0 / RAND_MAX;

	for(int i = 0; i < size * size; i++)

			a[i] = rand() * f;

}



/* Displays the matrix */

void matrixDisplay(char matrix, float* a, int size){

	int i = 0;

	cout << matrix <<" is: " << endl;

	while(i < size*size){

		for(int j = 0; j < size; j++, i++)

			cout << a[i] << " ";

		cout << endl;

	}

}



/* Creates the Summed area table */

__global__ void SummedAreaTable(float* A, float* B, int size){

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	float sum = 0;

	int rest;

	

	if(idx < (size*size)){

		if(idx < size)

			rest = idx;

		else{

			rest = idx - size;

			while(rest >= size)

				rest = rest - size;

		}

		for(int i = (size*size) - size + rest; i >= idx - rest; i -= size)

			for(int j = i, k = rest; k >= 0; j--, k--)

				sum += A[j];

		B[idx] = sum;

	}

}



int main(int argc, char* argv[]){

	if(argc == 2){ // only one argument (program name + one argument) allowed

		int size = atoi(argv[1]);

		int tSize = size * size;

		int d;

		int nThreads;

		int mThreads;

		int Blocks;

		int Threads;

		float *a = new float[tSize];

		float *A;

		float *b = new float[tSize];

		float *B;

		cudaError_t error; // error handler

		

		/* Gets the maximum number of threads and blocks */

		cudaDeviceProp prop;

		cudaGetDevice(&d);

		cudaGetDeviceProperties(&prop, d);

		nThreads = prop.maxThreadsDim[0];

		mThreads = nThreads * prop.maxGridSize[0];

		

		/* Checks if the size of the matrix is less than the maximum number of threads */

		if((tSize) < nThreads){

			Blocks = 1;

			Threads = tSize;

		}

		/* Checks if the size of the matrix is greater than the maximum number of threads */

		else if((tSize) > nThreads){

			Blocks = (tSize + nThreads - 1) / nThreads;

			Threads = nThreads;

		}

		/* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */

		else if((tSize) > mThreads){

			tSize = mThreads;

			Blocks = (tSize + nThreads - 1) / nThreads;

			Threads = nThreads;

		}

		

		dim3 dGrid(Blocks, Blocks, 1); // sets the grids

		dim3 dBlock(Threads, Threads, 1); // sets the blocks

	

		

		initializeMatrix(a,size); // initializes the matrix a

		error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A;

		if (error != cudaSuccess) {

			cout << cudaGetErrorString(error) << endl;

    }

		error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B;

		if (error != cudaSuccess) {

			cout << cudaGetErrorString(error) << endl;

    }

		error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A

		if (error != cudaSuccess) {

			cout << cudaGetErrorString(error) << endl;

    }

		

		/* Performs the SAT on the device on A and stores it on B */

		SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b

		cudaDeviceSynchronize(); // synchronizes the host and the device

		error = cudaGetLastError();

		if (error != cudaSuccess) {

			cout << cudaGetErrorString(error) << endl;

			cudaFree(a);

			cudaFree(b);

			delete [] a;

			delete [] b;

			return 3;

	  }

		/* copies device matrix B into host matrix b */

		error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost); 

		if (error != cudaSuccess) {

			cout << cudaGetErrorString(error) << endl;

    }

		

		//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a

		//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b

		

		/* deallocates space in both host memory and device memory */

		cudaFree(A);

		cudaFree(B);

		delete [] a;

		delete [] b;

		cudaDeviceReset();

		cout << "Finished" << endl;

		return 0;

	}

	else if(argc < 2){

		cout << "Please provide a size" << endl; // when no arguments are supplied

		return 0;

	}

	else{

		cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied

		return 0;

	}

}


---- Profiling Results for the summedAreaTable() function ------



Word Problem          A1/CPU (Seconds)           A2/GPU (Seconds)      

    100                   0.03                        0.0034
    200                   0.61                        0.0445
    300                   3.08                        0.2124
    400                   9.66                        0.6549
    500                  24.17                        1.58
    600                  54.4                         3.268
    700                 113.17                        5.976

--------------------------------------------------------------------

Assignment 3

Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.