Difference between revisions of "Carlos"

From CDOT Wiki
Jump to: navigation, search
(Progress)
(Assignment 2)
Line 146: Line 146:
  
 
=== Assignment 2 ===
 
=== Assignment 2 ===
 +
 +
<pre>
 +
 +
#include <iostream>
 +
 +
#include <cstdlib>
 +
 +
#include <cuda_runtime.h>
 +
 +
 +
 +
using namespace std;
 +
 +
 +
 +
/* Initializes the matrix to any random number between 0 and 1 */
 +
 +
void initializeMatrix(float* a, int size){
 +
 +
float f = 1.0 / RAND_MAX;
 +
 +
for(int i = 0; i < size * size; i++)
 +
 +
a[i] = rand() * f;
 +
 +
}
 +
 +
 +
 +
/* Displays the matrix */
 +
 +
void matrixDisplay(char matrix, float* a, int size){
 +
 +
int i = 0;
 +
 +
cout << matrix <<" is: " << endl;
 +
 +
while(i < size*size){
 +
 +
for(int j = 0; j < size; j++, i++)
 +
 +
cout << a[i] << " ";
 +
 +
cout << endl;
 +
 +
}
 +
 +
}
 +
 +
 +
 +
/* Creates the Summed area table */
 +
 +
__global__ void SummedAreaTable(float* A, float* B, int size){
 +
 +
int idx = blockIdx.x * blockDim.x + threadIdx.x;
 +
 +
float sum = 0;
 +
 +
int rest;
 +
 +
 +
 +
if(idx < (size*size)){
 +
 +
if(idx < size)
 +
 +
rest = idx;
 +
 +
else{
 +
 +
rest = idx - size;
 +
 +
while(rest >= size)
 +
 +
rest = rest - size;
 +
 +
}
 +
 +
for(int i = (size*size) - size + rest; i >= idx - rest; i -= size)
 +
 +
for(int j = i, k = rest; k >= 0; j--, k--)
 +
 +
sum += A[j];
 +
 +
B[idx] = sum;
 +
 +
}
 +
 +
}
 +
 +
 +
 +
int main(int argc, char* argv[]){
 +
 +
if(argc == 2){ // only one argument (program name + one argument) allowed
 +
 +
int size = atoi(argv[1]);
 +
 +
int tSize = size * size;
 +
 +
int d;
 +
 +
int nThreads;
 +
 +
int mThreads;
 +
 +
int Blocks;
 +
 +
int Threads;
 +
 +
float *a = new float[tSize];
 +
 +
float *A;
 +
 +
float *b = new float[tSize];
 +
 +
float *B;
 +
 +
cudaError_t error; // error handler
 +
 +
 +
 +
/* Gets the maximum number of threads and blocks */
 +
 +
cudaDeviceProp prop;
 +
 +
cudaGetDevice(&d);
 +
 +
cudaGetDeviceProperties(&prop, d);
 +
 +
nThreads = prop.maxThreadsDim[0];
 +
 +
mThreads = nThreads * prop.maxGridSize[0];
 +
 +
 +
 +
/* Checks if the size of the matrix is less than the maximum number of threads */
 +
 +
if((tSize) < nThreads){
 +
 +
Blocks = 1;
 +
 +
Threads = tSize;
 +
 +
}
 +
 +
/* Checks if the size of the matrix is greater than the maximum number of threads */
 +
 +
else if((tSize) > nThreads){
 +
 +
Blocks = (tSize + nThreads - 1) / nThreads;
 +
 +
Threads = nThreads;
 +
 +
}
 +
 +
/* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */
 +
 +
else if((tSize) > mThreads){
 +
 +
tSize = mThreads;
 +
 +
Blocks = (tSize + nThreads - 1) / nThreads;
 +
 +
Threads = nThreads;
 +
 +
}
 +
 +
 +
 +
dim3 dGrid(Blocks, Blocks, 1); // sets the grids
 +
 +
dim3 dBlock(Threads, Threads, 1); // sets the blocks
 +
 +
 +
 +
 +
 +
initializeMatrix(a,size); // initializes the matrix a
 +
 +
error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A;
 +
 +
if (error != cudaSuccess) {
 +
 +
cout << cudaGetErrorString(error) << endl;
 +
 +
    }
 +
 +
error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B;
 +
 +
if (error != cudaSuccess) {
 +
 +
cout << cudaGetErrorString(error) << endl;
 +
 +
    }
 +
 +
error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A
 +
 +
if (error != cudaSuccess) {
 +
 +
cout << cudaGetErrorString(error) << endl;
 +
 +
    }
 +
 +
 +
 +
/* Performs the SAT on the device on A and stores it on B */
 +
 +
SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b
 +
 +
cudaDeviceSynchronize(); // synchronizes the host and the device
 +
 +
error = cudaGetLastError();
 +
 +
if (error != cudaSuccess) {
 +
 +
cout << cudaGetErrorString(error) << endl;
 +
 +
cudaFree(a);
 +
 +
cudaFree(b);
 +
 +
delete [] a;
 +
 +
delete [] b;
 +
 +
return 3;
 +
 +
  }
 +
 +
/* copies device matrix B into host matrix b */
 +
 +
error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost);
 +
 +
if (error != cudaSuccess) {
 +
 +
cout << cudaGetErrorString(error) << endl;
 +
 +
    }
 +
 +
 +
 +
//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
 +
 +
//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b
 +
 +
 +
 +
/* deallocates space in both host memory and device memory */
 +
 +
cudaFree(A);
 +
 +
cudaFree(B);
 +
 +
delete [] a;
 +
 +
delete [] b;
 +
 +
cudaDeviceReset();
 +
 +
cout << "Finished" << endl;
 +
 +
return 0;
 +
 +
}
 +
 +
else if(argc < 2){
 +
 +
cout << "Please provide a size" << endl; // when no arguments are supplied
 +
 +
return 0;
 +
 +
}
 +
 +
else{
 +
 +
cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
 +
 +
return 0;
 +
 +
}
 +
 +
}
 +
 +
 +
 +
Size
 +
A1  (CPU)
 +
A2  (GPU)
 +
 +
100
 +
0.03
 +
0.0034
 +
 +
200
 +
0.61
 +
0.0445
 +
 +
300
 +
3.08
 +
0.2124
 +
 +
400
 +
9.66
 +
0.6549
 +
 +
500
 +
24.17
 +
1.58
 +
 +
600
 +
54.4
 +
3.268
 +
 +
700
 +
113.17
 +
5.976
 +
 +
</pre>

Revision as of 11:54, 8 March 2013

Team Carlos

Team Members

  1. Carlos Conejo

Progress

Assignment 1

For Assignment 1, I profiled a summed-area table (SAT) implementation. Here is the code:


#include <iostream>

#include <cstdlib>



using namespace std;



/* Allocates the rows of the matrice: each of the first `size` slots of `a` gets its own array of `size` floats (contents are left uninitialized) */
void createMatrice(float** a, int size){
	int row = 0;
	while(row < size){
		a[row] = new float[size];
		++row;
	}
}



/* Initializes every cell of the size x size matrice to rand() scaled by 1/RAND_MAX, i.e. a random number between 0 and 1 */
void initializeMatrice(float** a, int size){
	const float scale = 1.0 / RAND_MAX;
	for(int row = 0; row < size; ++row){
		float* cells = a[row];
		// cells are filled left to right so rand() is consumed in row-major order
		for(int col = 0; col < size; ++col)
			cells[col] = rand() * scale;
	}
}



/*
 * Creates the summed area table of the size x size matrice `a` and stores it in `b`.
 *
 * b[i][j] receives the sum of a[k][m] over rows k = i .. size-1 and
 * columns m = 0 .. j.
 *
 * Every cell is re-summed from scratch through four nested loops, so the
 * amount of work grows with size^4; this is the function that was profiled.
 * `a` is only read, `b` is only written. Nothing happens when size <= 0.
 */
void summarizedAreaTable(float** a, float** b, int size){
	for(int i = size-1; i >= 0; i--){
		for(int j = 0; j < size; j++){
			// fresh accumulator per output cell (no manual reset needed)
			float sum = 0.0f;
			for(int k = i; k < size; k++)
				for(int m = 0; m <= j; m++)
					sum += a[k][m];
			b[i][j] = sum;
		}
	}
}



/*
 * Entry point of the CPU version.
 *
 * Expects exactly one argument, the matrice size. Builds two size x size
 * matrices, fills the first with random values, computes its summed area
 * table into the second and prints "Finished".
 *
 * Returns 0 on success and on the two usage messages, 1 when the size
 * argument is not a positive integer.
 */
int main(int argc, char* argv[]){
	if(argc < 2){
		cout << "Please provide a size" << endl; // when no arguments are supplied
		return 0;
	}
	if(argc > 2){
		cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
		return 0;
	}

	int size = atoi(argv[1]);
	// atoi yields 0 for non-numeric text; a negative value would make new[] throw
	if(size <= 0){
		cout << "Size must be a positive integer" << endl;
		return 1;
	}

	float **a = new float*[size];
	float **b = new float*[size];

	createMatrice(a,size); // creates the matrice a
	createMatrice(b,size); // creates the matrice b
	initializeMatrice(a,size); // initializes the matrice a
	summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b

	/* releases every row, then the row tables themselves */
	for(int i = 0; i < size; i++){
		delete [] a[i];
		delete [] b[i];
	}
	delete [] a;
	delete [] b;

	cout << "Finished" << endl;
	return 0;
}


---- Profiling Results for the summarizedAreaTable() function ------



		Problem Size                          Seconds      

			250                                  1.50

			500                                 25.87

			750                                173.99 

		   1000	                               658.34

   

--------------------------------------------------------------------





Assignment 2


#include <iostream>

#include <cstdlib>

#include <cuda_runtime.h>



using namespace std;



/* Fills all size*size entries of the flat matrix with rand() scaled by 1/RAND_MAX, i.e. a random number between 0 and 1 */
void initializeMatrix(float* a, int size){
	const float scale = 1.0 / RAND_MAX;
	const int count = size * size;
	float* cell = a;
	for(int filled = 0; filled < count; ++filled, ++cell)
		*cell = rand() * scale;
}



/*
 * Displays the matrix.
 *
 * Prints "<matrix> is: " on its own line, then the size x size values of
 * the flat array `a` one row per line, each value followed by a space.
 *
 *   matrix - single character used as the label of the matrix
 *   a      - the size*size values, stored row after row
 *   size   - number of rows and of columns; only the label line is
 *            printed when size <= 0
 */
void matrixDisplay(char matrix, float* a, int size){
	cout << matrix <<" is: " << endl;
	// explicit row/column loops: they terminate for any size, including negative ones
	for(int row = 0; row < size; row++){
		const float* cells = a + row * size;
		for(int col = 0; col < size; col++)
			cout << cells[col] << " ";
		cout << endl;
	}
}



/*
 * Creates the Summed area table.
 *
 * One thread per cell of the flat size x size matrix: the thread with
 * global index idx handles the cell in column idx % size of the row that
 * starts at idx - (idx % size). It writes into B[idx] the sum of the A
 * values in that row and every row after it, restricted to columns 0 up
 * to its own column.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are read)
 * providing at least size*size threads; surplus threads leave through the
 * bounds check. No shared memory and no block synchronization are used.
 *
 *   A    - input matrix, size*size floats (only read)
 *   B    - output matrix, size*size floats (only written)
 *   size - number of rows and of columns
 */
__global__ void SummedAreaTable(float* A, float* B, int size){
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// surplus threads of the last block have no cell; this also keeps the
	// modulo below from ever running with size == 0
	if(idx >= (size*size))
		return;

	int col = idx % size;      // column of this thread's cell
	int rowStart = idx - col;  // flat index of the first cell of its row
	float sum = 0;

	// i walks column `col` from the last row up to this thread's row;
	// for each of those rows j walks from column `col` back to column 0
	for(int i = (size*size) - size + col; i >= rowStart; i -= size)
		for(int j = i, k = col; k >= 0; j--, k--)
			sum += A[j];

	B[idx] = sum;
}



/* Prints the CUDA error text when `status` is an error; returns true in that case */
static bool cudaFailed(cudaError_t status){
	if(status == cudaSuccess)
		return false;
	cout << cudaGetErrorString(status) << endl;
	return true;
}

/* Deallocates the device matrices A and B and the host matrices a and b (null pointers are accepted) */
static void releaseBuffers(float* A, float* B, float* a, float* b){
	cudaFree(A);
	cudaFree(B);
	delete [] a;
	delete [] b;
}

/*
 * Entry point of the GPU version.
 *
 * Expects exactly one argument, the matrix size. Fills a size x size host
 * matrix with random values, copies it to the device, runs the
 * SummedAreaTable kernel with one thread per cell, copies the result back
 * and prints "Finished".
 *
 * Returns 0 on success and on the two usage messages, 1 when the size is
 * rejected or a CUDA setup/allocation/copy call fails, 3 when the kernel
 * launch or its execution fails.
 */
int main(int argc, char* argv[]){
	if(argc < 2){
		cout << "Please provide a size" << endl; // when no arguments are supplied
		return 0;
	}
	if(argc > 2){
		cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
		return 0;
	}

	int size = atoi(argv[1]);
	// 46340 is the largest value whose square still fits in an int; the
	// kernel indexes the matrix with int arithmetic on size*size
	if(size <= 0 || size > 46340){
		cout << "Size must be a positive integer no greater than 46340" << endl;
		return 1;
	}
	int tSize = size * size;
	size_t bytes = (size_t)tSize * sizeof(float);

	/* Gets the limits of the current device */
	int d;
	cudaDeviceProp prop;
	if(cudaFailed(cudaGetDevice(&d)) || cudaFailed(cudaGetDeviceProperties(&prop, d)))
		return 1;

	/* One thread per cell: a single partial block for small matrices,
	   otherwise full blocks with the cell count rounded up */
	int Threads = prop.maxThreadsPerBlock;
	if(tSize < Threads)
		Threads = tSize;
	int Blocks = tSize / Threads + (tSize % Threads != 0 ? 1 : 0);

	/* The kernel is launched on a 1D grid, so the x grid limit applies */
	if(Blocks > prop.maxGridSize[0]){
		cout << "Matrix is too large for a single kernel launch on this device" << endl;
		return 1;
	}

	float *a = new float[tSize];
	float *b = new float[tSize];
	float *A = 0;
	float *B = 0;

	initializeMatrix(a,size); // initializes the matrix a

	/* allocates both device matrices and copies the host matrix a into the device matrix A */
	if(cudaFailed(cudaMalloc((void**)&A, bytes)) ||
	   cudaFailed(cudaMalloc((void**)&B, bytes)) ||
	   cudaFailed(cudaMemcpy(A, a, bytes, cudaMemcpyHostToDevice))){
		releaseBuffers(A, B, a, b);
		return 1;
	}

	/* Performs the SAT on the device on A and stores it on B */
	SummedAreaTable<<<Blocks,Threads>>>(A,B,size);

	// cudaGetLastError reports a rejected launch configuration,
	// cudaDeviceSynchronize reports a failure while the kernel was running
	if(cudaFailed(cudaGetLastError()) || cudaFailed(cudaDeviceSynchronize())){
		releaseBuffers(A, B, a, b);
		return 3;
	}

	/* copies device matrix B into host matrix b */
	if(cudaFailed(cudaMemcpy(b, B, bytes, cudaMemcpyDeviceToHost))){
		releaseBuffers(A, B, a, b);
		return 1;
	}

	//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
	//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b

	/* deallocates space in both host memory and device memory */
	releaseBuffers(A, B, a, b);
	cudaDeviceReset();
	cout << "Finished" << endl;
	return 0;
}



Size
A1  (CPU)
A2  (GPU)

100
0.03
0.0034

200
0.61
0.0445

300
3.08
0.2124

400
9.66
0.6549

500
24.17
1.58

600
54.4
3.268

700
113.17
5.976