Latest revision as of 15:59, 15 April 2013

Team Carlos

Team Members

Carlos Conejo

Progress

Assignment 1

For Assignment 1, I profiled an implementation of a Summed Area Table (SAT). Here is the code:

#include <iostream>

#include <cstdlib>



using namespace std;



/*
 * Allocates the rows of a square matrix.
 *   a    - array with room for `size` row pointers; every entry is overwritten
 *   size - number of rows, and number of floats allocated for each row
 * Each row comes from new[]; the elements themselves are left uninitialized.
 */
void createMatrice(float** a, int size){
	int row = 0;
	while(row < size){
		a[row] = new float[size];
		++row;
	}
}



/*
 * Fills a size x size matrix with pseudo-random values.
 * Every element is rand() scaled by 1/RAND_MAX, so the values lie in [0, 1].
 *   a    - array of `size` row pointers, each addressing `size` floats
 *   size - matrix dimension
 * rand() is consumed in row-major order; the generator is not seeded here.
 */
void initializeMatrice(float** a, int size){
	const float scale = 1.0 / RAND_MAX; // evaluated in double, then narrowed to float
	for(int row = 0; row < size; ++row){
		float* cells = a[row];
		for(int col = 0; col < size; ++col)
			cells[col] = std::rand() * scale;
	}
}



/*
 * Builds the summed area table of a size x size matrix.
 * b[i][j] receives the sum of a[k][m] over every row k in [i, size-1]
 * and every column m in [0, j].
 *   a    - input matrix (array of row pointers); only read
 *   b    - output matrix (array of row pointers); every element is written
 *   size - matrix dimension
 * Each output element is accumulated from scratch in a float, visiting
 * rows from i upwards and, within a row, columns from 0 to j.
 */
void summarizedAreaTable(float** a, float** b, int size){
	for(int top = size - 1; top >= 0; --top){
		float* out = b[top];
		for(int right = 0; right < size; ++right){
			float total = 0.0f;
			for(int row = top; row < size; ++row){
				const float* in = a[row];
				for(int col = 0; col <= right; ++col)
					total += in[col];
			}
			out[right] = total;
		}
	}
}



/*
 * Entry point: builds a random size x size matrix, computes its summed
 * area table and reports completion.
 * Usage: <program> <size>, where <size> is a positive integer.
 * Returns 0 after a successful run or after printing a usage message,
 * and 1 when <size> is not a positive integer.
 */
int main(int argc, char* argv[]){
	if(argc < 2){
		std::cout << "Please provide a size" << std::endl; // when no arguments are supplied
		return 0;
	}
	if(argc > 2){
		std::cout << "Only one size is allowed" << std::endl; // when more than one argument(the program name + one or more arguments) is supplied
		return 0;
	}

	// strtol reports where parsing stopped, so empty, non-numeric and
	// trailing-garbage input can be rejected instead of silently becoming 0.
	char* end = 0;
	const long parsed = std::strtol(argv[1], &end, 10);
	const int size = static_cast<int>(parsed);
	// `size != parsed` rejects values that do not survive the narrowing to int.
	if(end == argv[1] || *end != '\0' || parsed <= 0 || size != parsed){
		std::cout << "Size must be a positive integer" << std::endl;
		return 1;
	}

	float **a = new float*[size];
	float **b = new float*[size];

	createMatrice(a,size); // creates the matrice a
	createMatrice(b,size); // creates the matrice b
	initializeMatrice(a,size); // initializes the matrices
	summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b

	// Release every row first, then the arrays of row pointers.
	for(int i = 0; i < size; i++){
		delete [] a[i];
		delete [] b[i];
	}
	delete [] a;
	delete [] b;

	std::cout << "Finished" << std::endl;
	return 0;
}


---- Profiling Results for the summarizedAreaTable() function ------



		Problem Size (n)                      Seconds      

	            250                                  1.50

	            500                                 25.87

		    750                                173.99 

		   1000	                               658.34

   

--------------------------------------------------------------------

Assignment 2

Here is my code to parallelize the SAT algorithm I worked on in A1:

#include <iostream>

#include <cstdlib>

#include <cuda_runtime.h>



using namespace std;



/*
 * Fills a size x size matrix, stored as one contiguous array, with
 * pseudo-random values: each element is rand() scaled by 1/RAND_MAX,
 * so the values lie in [0, 1].
 *   a    - host array of size*size floats; every element is written
 *   size - matrix dimension
 * The generator is not seeded here.
 */
void initializeMatrix(float* a, int size){
	const float scale = 1.0 / RAND_MAX; // evaluated in double, then narrowed to float
	const int count = size * size;
	int n = 0;
	while(n < count){
		a[n] = std::rand() * scale;
		++n;
	}
}



/*
 * Displays a square matrix on standard output.
 * Prints the heading "<matrix> is: " and then one line per row, each
 * element followed by a single space.
 *   matrix - one-character label used in the heading
 *   a      - host array of size*size floats in row-major order
 *   size   - matrix dimension; when size <= 0 only the heading is printed
 */
void matrixDisplay(char matrix, float* a, int size){
	std::cout << matrix <<" is: " << std::endl;
	// Iterating by row and column terminates for every value of size,
	// including zero and negative values.
	const float* cell = a;
	for(int row = 0; row < size; row++){
		for(int col = 0; col < size; col++, cell++)
			std::cout << *cell << " ";
		std::cout << std::endl;
	}
}



/*
 * Creates the summed area table of a size x size row-major matrix.
 *
 * For the element at (row, col), B[row*size + col] receives the sum of A
 * over rows [row, size-1] and columns [0, col]. Rows are visited from the
 * last one upwards and, within a row, columns from `col` down to 0.
 *
 * Launch layout: one-dimensional grid of one-dimensional blocks (only the
 * .x components are read). Any grid/block size is accepted: each thread
 * starts at its global index and advances by the total thread count until
 * all size*size elements are done. No shared memory is used.
 *
 *   A    - size*size input floats, only read
 *   B    - size*size output floats, each written exactly once
 *   size - matrix dimension; the kernel does nothing when size <= 0
 * A and B must be dereferenceable from device code.
 */
__global__ void SummedAreaTable(float* A, float* B, int size){
	if(size <= 0)
		return;

	// 64-bit arithmetic keeps size*size and the global index from overflowing.
	const long long total = (long long)size * size;
	const long long stride = (long long)gridDim.x * blockDim.x;
	const long long lastRow = total - size; // offset of the first element of the last row

	for(long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride){
		const long long col = idx % size;    // column of this element
		const long long rowStart = idx - col; // offset of the first element of its row
		float sum = 0;

		for(long long rowBase = lastRow; rowBase >= rowStart; rowBase -= size)
			for(long long j = rowBase + col; j >= rowBase; j--)
				sum += A[j];

		B[idx] = sum;
	}
}



/* Prints the CUDA error string and returns true when status is not cudaSuccess */
static bool cudaFailed(cudaError_t status){
	if(status == cudaSuccess)
		return false;
	std::cout << cudaGetErrorString(status) << std::endl;
	return true;
}

/* Frees the device buffers A and B and the host buffers a and b, resets the device, and returns code */
static int cleanup(float* A, float* B, float* a, float* b, int code){
	cudaFree(A); // a null device pointer is accepted by cudaFree
	cudaFree(B);
	delete [] a;
	delete [] b;
	cudaDeviceReset();
	return code;
}

/*
 * Entry point: fills a random size x size matrix on the host, computes its
 * summed area table on the device and copies the result back.
 * Usage: <program> <size>
 * Return codes:
 *   0 - success, or a usage message was printed
 *   1 - invalid size, device query failure, matrix too large for one
 *       launch, or device allocation failure
 *   2 - host-to-device copy failed
 *   3 - kernel launch or execution failed
 *   4 - device-to-host copy failed
 */
int main(int argc, char* argv[]){
	if(argc < 2){
		std::cout << "Please provide a size" << std::endl; // when no arguments are supplied
		return 0;
	}
	if(argc > 2){
		std::cout << "Only one size is allowed" << std::endl; // when more than one argument(the program name + one or more arguments) is supplied
		return 0;
	}

	// 46340 is the largest dimension whose square still fits in a 32-bit int
	// (46340 * 46340 = 2147395600).
	const int maxSize = 46340;
	const int size = atoi(argv[1]);
	if(size <= 0 || size > maxSize){
		std::cout << "Size must be between 1 and " << maxSize << std::endl;
		return 1;
	}
	const int tSize = size * size;

	/* Gets the device limits that bound a one-dimensional launch */
	int d;
	cudaDeviceProp prop;
	if(cudaFailed(cudaGetDevice(&d)) || cudaFailed(cudaGetDeviceProperties(&prop, d)))
		return 1;

	// A 1D block is limited both by the per-block thread total and by the
	// x-dimension limit; never launch more threads than there are elements.
	int Threads = prop.maxThreadsPerBlock;
	if(Threads > prop.maxThreadsDim[0])
		Threads = prop.maxThreadsDim[0];
	if(Threads > tSize)
		Threads = tSize;

	// One thread per element, rounded up to whole blocks.
	const int Blocks = (tSize + Threads - 1) / Threads;
	if(Blocks > prop.maxGridSize[0]){
		std::cout << "Matrix is too large for a one-dimensional grid on this device" << std::endl;
		return 1;
	}

	const size_t bytes = static_cast<size_t>(tSize) * sizeof(float);
	float *a = new float[tSize];
	float *b = new float[tSize];
	float *A = 0; // stays null until cudaMalloc succeeds, so cleanup is always safe
	float *B = 0;

	initializeMatrix(a,size); // initializes the matrix a

	/* allocates memory on the device for matrices A and B */
	if(cudaFailed(cudaMalloc((void**)&A, bytes)) || cudaFailed(cudaMalloc((void**)&B, bytes)))
		return cleanup(A, B, a, b, 1);

	/* copies the host matrix a into the device matrix A */
	if(cudaFailed(cudaMemcpy(A, a, bytes, cudaMemcpyHostToDevice)))
		return cleanup(A, B, a, b, 2);

	/* Performs the SAT on the device on A and stores it on B */
	SummedAreaTable<<<Blocks,Threads>>>(A,B,size);
	// cudaGetLastError reports an invalid launch configuration;
	// cudaDeviceSynchronize reports faults raised while the kernel ran.
	if(cudaFailed(cudaGetLastError()) || cudaFailed(cudaDeviceSynchronize()))
		return cleanup(A, B, a, b, 3);

	/* copies device matrix B into host matrix b */
	if(cudaFailed(cudaMemcpy(b, B, bytes, cudaMemcpyDeviceToHost)))
		return cleanup(A, B, a, b, 4);

	//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
	//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b

	/* deallocates space in both host memory and device memory */
	cleanup(A, B, a, b, 0);
	std::cout << "Finished" << std::endl;
	return 0;
}


---- Profiling Results: summarizedAreaTable() on the CPU (A1) vs. the SummedAreaTable() kernel on the GPU (A2) ------



Problem Size (n)      A1/CPU (Seconds)           A2/GPU (Seconds)      

    100                   0.03                        0.0034
    200                   0.61                        0.0445
    300                   3.08                        0.2124
    400                   9.66                        0.6549
    500                  24.17                        1.58
    600                  54.4                         3.268
    700                 113.17                        5.976

--------------------------------------------------------------------

Assignment 3

Because the code provided in A2 to parallelize the Summed Area Table proved difficult to optimize, the professor and I agreed that I would provide only an explanation of how I would optimize my code using a prefix-sum (scan) method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.

Difference between revisions of "Carlos"

Latest revision as of 15:59, 15 April 2013

Contents

Team Carlos

Team Members

Progress

Assignment 1

Assignment 2

Assignment 3

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools

@@ Line 1: / Line 1: @@
-= Team Omega (XVI)=
+= Team Carlos =
-== Project Marking Percentage ==
-* due immediately
-<big>
- Group work:      25%        (25 <= xx <= 50)
- Individual work: 75% +      (50 <= xx <= 75)
- -------------------------
- Total           100%
-</big>
-== Repository ==
+== Team Members ==
-* repo Github id: XVI-Omega
+# [mailto:cjconejomolero@myseneca.ca?subject=gpu610 Carlos Conejo]
-* repo Github URL: [https://github.com/Seneca-OOP344/XVI-Omega XVI-Omega]
+== Progress ==
+=== Assignment 1 ===
+For my assignment 1, I profiled a Summarized Area Table code.
+Here is the code:
-== Team Members ==
-{| class="wikitable sortable" border="1" cellpadding="5"
-|+ Omega (Team XVI)
-! First Name !! Last Name  !! Section !! Seneca Id !! Github ID !! wiki id !!  IRC nick !! Blog URL
-|-
-|[[User:Carlos Javier Conejo Molero|Carlos]]||Conejo||A||[mailto:cjconejomolero@myseneca.ca cjconejomolero]||carlosjavi3r|| [[Special:Contributions/Carlos Javier Conejo Molero|Carlos Conejo]]||Carlos||[http://www.blog.ca/user/carlosjavi3r/ Carlos's Blog]
-|-
-|[[User:csho3 |Christopher ]]|| Ho || A || [mailto:csho@myseneca.ca?subject=oop344 csho3] || [http://github.com/chhoris Chhoris] || [[Special:Contributions/csho3 | csho3]] || csho3 ||[http://csho3.wordpress.com/ Chris's Blog]
-|-
-<!-- use this to add rows to this table:
-|[[User:WikiID | FristName]]|| LastName || Section || [mailto:YourSenecaEmailID@myseneca.ca?subject=oop344 SenecaEmailID] || Github:[http://github.com/GithubID GithubID] || [[Special:Contributions/WikiID | WikiID]] || IrcNick || [http://yourBlogURL BlogName]
-|-
--->
+<pre>
+#include <iostream>
-|}
+#include <cstdlib>
-==Issues and Status ==
-=== V_1AddConsoleClass ===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Code review by: [mailto:hgnguyen1@myseneca.ca Hao Gia Nguyen]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.1CFieldClass ===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Code review by: [mailto:rdasilvarodrigues@myseneca.ca Rafael Rodrigues]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.2CLabelClass ===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Code review by: [mailto:csho3@myseneca.ca Christopher Ho]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.3CDialogClass ===
-* Assigned to: [mailto:rdasilvarodrigues@myseneca.ca Rafael Rodrigues]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: being developed
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.4CLineEditClass ===
-* Assigned to: [mailto:rdasilvarodrigues@myseneca.ca Rafael Rodrigues]
-* Code review by: [mailto:hgnguyen1@myseneca.ca Hao Gia Nguyen]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.5CbuttonClass ===
-* Assigned to: [mailto:csho3@myseneca.ca Christopher Ho]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.6ValEditClass ===
+using namespace std;
-* Assigned to: [mailto:csho3@myseneca.ca Christopher Ho]
-* Code review by: [mailto:hgnguyen1@myseneca.ca Hao Gia Nguyen]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.7CCheckMarkClass ===
-* Assigned to: [mailto:csho3@myseneca.ca Christopher Ho]
-* Code review by: [mailto:rdasilvarodrigues@myseneca.ca Rafael Rodrigues]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.8.1AddCTextClass ===
-* Assigned to: [mailto:hgnguyen1@myseneca.ca Hao Gia Nguyen]
-* Code review by: [mailto:rdasilvarodrigues@myseneca.ca Rafael Rodrigues]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.8.2CTextClass ===
-* Assigned to: [mailto:hgnguyen1@myseneca.ca Hao Gia Nguyen]
-* Code review by: [mailto:csho3@myseneca.ca Christopher Ho]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_2.9CCheckListClass ===
-* Assigned to: [mailto:hgnguyen1@myseneca.ca Hao Gia Nguyen]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: Done
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_3.1CField ===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_3.2CDialog ===
-* Assigned to: [mailto:csho3@myseneca.ca Christopher Ho]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_3.3CLabel===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_3.4CLineEdit===
-* Assigned to: Using object file
-* Code review by: Using object file
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_4.1CButton===
-* Assigned to: Using object file
-* Code review by: Using object file
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_4.2CVLineEdit===
-* Assigned to: Using object file
-* Code review by: Using object file
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_4.3CCheckMark===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo]
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_6.1CText===
-* Assigned to: Using object file
-* Code review by: Using object file
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== V_6.2CCheckList===
-* Assigned to: Using object file
-* Code review by: Using object file
-* Status: pushed to master
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-=== Application ===
-* Assigned to: [mailto:cjconejomolero@myseneca.ca Carlos Conejo] & [mailto:csho3@myseneca.ca Christopher Ho]
-* Code review by: [mailto:cjconejomolero@myseneca.ca Carlos Conejo] & [mailto:csho3@myseneca.ca Christopher Ho]
-* Status: being developed
-*: (being developed/pull request/being reviewed/pushed to master)
-* comments:
-*:
-== Coding Rules ==
-. Tabs: 2 Spaces
-. Line up the arguments when they are too large (1 more tab)
+/* Creates the Matrice */
-. When Defining use this format "_NAME_", example: #define _CFIELD_
+void createMatrice(float** a, int size){
-. The opening Bracket on the same line of the "opener" e.g if(asd = "asd"){
+	for(int i = 0; i < size; i++)
-. At the beggining of each file write a comment with the name of the file e.g. //cfield.cpp
+		a[i] = new float[size];
-. Put comments for every function
+}
-. Only 1 line declares one variable
-== meetings ==
-* latest will be on top
-# [[(Irc)logs of meeting number ### - oop344 20113 | Thursday, 08-11 at 4:00pm - Seneca's Library]]
-# [[(Irc)logs of meeting number ### - oop344 20113 | Friday, 09-11 at 5:20pm (After OOP's Class) - Outside the Clasroom ]]
-== discussions ==
+/* Initializes the matrice to any random number between 1 and 9 */
+void initializeMatrice(float** a, int size){
+	float f = 1.0 / RAND_MAX;
+	for(int i = 0; i < size; i++)
+		for(int j = 0; j < size; j++)
+			a[i][j] = rand() * f;
+}
+/* Creates the summarized area table */
+void summarizedAreaTable(float** a, float** b, int size){
+	int k = 0;
+	float sum = 0.0;
+	for(int i = size-1; i >= 0; i--){
+		for(int j = 0; j < size; j++){
+			for(int k = i; k < size; k++){
+				for(int m = 0; m <= j; m++){
+					sum += a[k][m];
+				}
+			}
+			b[i][j] = sum;
+			sum = 0.0;
+		}
+	}
+}
+int main(int argc, char* argv[]){
+	if(argc == 2){ // only one argument (program name + one argument) allowed
+		int size = atoi(argv[1]);
+		float **a = new float*[size];
+		float **b = new float*[size];
+		createMatrice(a,size); // creates the matrice a
+		createMatrice(b,size); // creates the matrice b
+		initializeMatrice(a,size); // initializes the matrices
+		summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b
+		cout << "Finished" << endl;
+		return 0;
+	}
+	else if(argc < 2)
+		cout << "Please provide a size" << endl; // when no arguments are supplied
+	else
+		cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
+}
+---- Profiling Results for the summedAreaTable() function ------
+		Word Problem                          Seconds
+                                  1.50
+                                 25.87
+                                173.99
+	                               658.34
+--------------------------------------------------------------------
+</pre>
+=== Assignment 2 ===
+Here is my code to parallelize the SAT algorithm I worked on A1:
+<pre>
+#include <iostream>
+#include <cstdlib>
+#include <cuda_runtime.h>
+using namespace std;
+/* Initializes the matrix to any random number between 0 and 1 */
+void initializeMatrix(float* a, int size){
+	float f = 1.0 / RAND_MAX;
+	for(int i = 0; i < size * size; i++)
+			a[i] = rand() * f;
+}
+/* Displays the matrix */
+void matrixDisplay(char matrix, float* a, int size){
+	int i = 0;
+	cout << matrix <<" is: " << endl;
+	while(i < size*size){
+		for(int j = 0; j < size; j++, i++)
+			cout << a[i] << " ";
+		cout << endl;
+	}
+}
+/* Creates the Summed area table */
+__global__ void SummedAreaTable(float* A, float* B, int size){
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	float sum = 0;
+	int rest;
+	if(idx < (size*size)){
+		if(idx < size)
+			rest = idx;
+		else{
+			rest = idx - size;
+			while(rest >= size)
+				rest = rest - size;
+		}
+		for(int i = (size*size) - size + rest; i >= idx - rest; i -= size)
+			for(int j = i, k = rest; k >= 0; j--, k--)
+				sum += A[j];
+		B[idx] = sum;
+	}
+}
+int main(int argc, char* argv[]){
+	if(argc == 2){ // only one argument (program name + one argument) allowed
+		int size = atoi(argv[1]);
+		int tSize = size * size;
+		int d;
+		int nThreads;
+		int mThreads;
+		int Blocks;
+		int Threads;
+		float *a = new float[tSize];
+		float *A;
+		float *b = new float[tSize];
+		float *B;
+		cudaError_t error; // error handler
+		/* Gets the maximum number of threads and blocks */
+		cudaDeviceProp prop;
+		cudaGetDevice(&d);
+		cudaGetDeviceProperties(&prop, d);
+		nThreads = prop.maxThreadsDim[0];
+		mThreads = nThreads * prop.maxGridSize[0];
+		/* Checks if the size of the matrix is less than the maximum number of threads */
+		if((tSize) < nThreads){
+			Blocks = 1;
+			Threads = tSize;
+		}
+		/* Checks if the size of the matrix is greater than the maximum number of threads */
+		else if((tSize) > nThreads){
+			Blocks = (tSize + nThreads - 1) / nThreads;
+			Threads = nThreads;
+		}
+		/* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */
+		else if((tSize) > mThreads){
+			tSize = mThreads;
+			Blocks = (tSize + nThreads - 1) / nThreads;
+			Threads = nThreads;
+		}
+		dim3 dGrid(Blocks, Blocks, 1); // sets the grids
+		dim3 dBlock(Threads, Threads, 1); // sets the blocks
+		initializeMatrix(a,size); // initializes the matrix a
+		error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A;
+		if (error != cudaSuccess) {
+			cout << cudaGetErrorString(error) << endl;
+    }
+		error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B;
+		if (error != cudaSuccess) {
+			cout << cudaGetErrorString(error) << endl;
+    }
+		error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A
+		if (error != cudaSuccess) {
+			cout << cudaGetErrorString(error) << endl;
+    }
+		/* Performs the SAT on the device on A and stores it on B */
+		SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b
+		cudaDeviceSynchronize(); // synchronizes the host and the device
+		error = cudaGetLastError();
+		if (error != cudaSuccess) {
+			cout << cudaGetErrorString(error) << endl;
+			cudaFree(a);
+			cudaFree(b);
+			delete [] a;
+			delete [] b;
+			return 3;
+	  }
+		/* copies device matrix B into host matrix b */
+		error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost);
+		if (error != cudaSuccess) {
+			cout << cudaGetErrorString(error) << endl;
+    }
+		//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
+		//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b
+		/* deallocates space in both host memory and device memory */
+		cudaFree(A);
+		cudaFree(B);
+		delete [] a;
+		delete [] b;
+		cudaDeviceReset();
+		cout << "Finished" << endl;
+		return 0;
+	}
+	else if(argc < 2){
+		cout << "Please provide a size" << endl; // when no arguments are supplied
+		return 0;
+	}
+	else{
+		cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
+		return 0;
+	}
+}
+---- Profiling Results for the summedAreaTable() function ------
+Word Problem          A1/CPU (Seconds)           A2/GPU (Seconds)
+                   0.03                        0.0034
+                   0.61                        0.0445
+                   3.08                        0.2124
+                   9.66                        0.6549
+                  24.17                        1.58
+                  54.4                         3.268
+                 113.17                        5.976
+--------------------------------------------------------------------
+</pre>
+=== Assignment 3 ===
+Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.