Difference between revisions of "Carlos"
(→Issues and Status) |
(→Progress) |
||
(11 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | = Team | + | = Team Carlos = |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | == | + | == Team Members == |
− | + | # [mailto:cjconejomolero@myseneca.ca?subject=gpu610 Carlos Conejo] | |
− | + | == Progress == | |
+ | === Assignment 1 === | ||
+ | For my assignment 1, I profiled a Summarized Area Table code. | ||
+ | Here is the code: | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
+ | <pre> | ||
+ | #include <iostream> | ||
− | + | #include <cstdlib> | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | using namespace std; | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | /* Creates the Matrice */ | |
− | + | void createMatrice(float** a, int size){ | |
− | + | for(int i = 0; i < size; i++) | |
− | + | a[i] = new float[size]; | |
− | + | } | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | == | + | /* Initializes the matrice to any random number between 1 and 9 */ |
+ | |||
+ | void initializeMatrice(float** a, int size){ | ||
+ | |||
+ | float f = 1.0 / RAND_MAX; | ||
+ | |||
+ | for(int i = 0; i < size; i++) | ||
+ | |||
+ | for(int j = 0; j < size; j++) | ||
+ | |||
+ | a[i][j] = rand() * f; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Creates the summarized area table */ | ||
+ | |||
+ | void summarizedAreaTable(float** a, float** b, int size){ | ||
+ | |||
+ | int k = 0; | ||
+ | |||
+ | float sum = 0.0; | ||
+ | |||
+ | for(int i = size-1; i >= 0; i--){ | ||
+ | |||
+ | for(int j = 0; j < size; j++){ | ||
+ | |||
+ | for(int k = i; k < size; k++){ | ||
+ | |||
+ | for(int m = 0; m <= j; m++){ | ||
+ | |||
+ | sum += a[k][m]; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | b[i][j] = sum; | ||
+ | |||
+ | sum = 0.0; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | int main(int argc, char* argv[]){ | ||
+ | |||
+ | if(argc == 2){ // only one argument (program name + one argument) allowed | ||
+ | |||
+ | int size = atoi(argv[1]); | ||
+ | |||
+ | float **a = new float*[size]; | ||
+ | |||
+ | float **b = new float*[size]; | ||
+ | |||
+ | |||
+ | |||
+ | createMatrice(a,size); // creates the matrice a | ||
+ | |||
+ | createMatrice(b,size); // creates the matrice b | ||
+ | |||
+ | initializeMatrice(a,size); // initializes the matrices | ||
+ | |||
+ | summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b | ||
+ | |||
+ | |||
+ | |||
+ | cout << "Finished" << endl; | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | else if(argc < 2) | ||
+ | |||
+ | cout << "Please provide a size" << endl; // when no arguments are supplied | ||
+ | |||
+ | else | ||
+ | |||
+ | cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | ---- Profiling Results for the summedAreaTable() function ------ | ||
+ | |||
+ | |||
+ | |||
+ | Word Problem Seconds | ||
+ | |||
+ | 250 1.50 | ||
+ | |||
+ | 500 25.87 | ||
+ | |||
+ | 750 173.99 | ||
+ | |||
+ | 1000 658.34 | ||
+ | |||
+ | |||
+ | |||
+ | -------------------------------------------------------------------- | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | </pre> | ||
+ | |||
+ | === Assignment 2 === | ||
+ | |||
+ | Here is my code to parallelize the SAT algorithm I worked on A1: | ||
+ | |||
+ | <pre> | ||
+ | |||
+ | #include <iostream> | ||
+ | |||
+ | #include <cstdlib> | ||
+ | |||
+ | #include <cuda_runtime.h> | ||
+ | |||
+ | |||
+ | |||
+ | using namespace std; | ||
+ | |||
+ | |||
+ | |||
+ | /* Initializes the matrix to any random number between 0 and 1 */ | ||
+ | |||
+ | void initializeMatrix(float* a, int size){ | ||
+ | |||
+ | float f = 1.0 / RAND_MAX; | ||
+ | |||
+ | for(int i = 0; i < size * size; i++) | ||
+ | |||
+ | a[i] = rand() * f; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Displays the matrix */ | ||
+ | |||
+ | void matrixDisplay(char matrix, float* a, int size){ | ||
+ | |||
+ | int i = 0; | ||
+ | |||
+ | cout << matrix <<" is: " << endl; | ||
+ | |||
+ | while(i < size*size){ | ||
+ | |||
+ | for(int j = 0; j < size; j++, i++) | ||
+ | |||
+ | cout << a[i] << " "; | ||
+ | |||
+ | cout << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Creates the Summed area table */ | ||
+ | |||
+ | __global__ void SummedAreaTable(float* A, float* B, int size){ | ||
+ | |||
+ | int idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
+ | |||
+ | float sum = 0; | ||
+ | |||
+ | int rest; | ||
+ | |||
+ | |||
+ | |||
+ | if(idx < (size*size)){ | ||
+ | |||
+ | if(idx < size) | ||
+ | |||
+ | rest = idx; | ||
+ | |||
+ | else{ | ||
+ | |||
+ | rest = idx - size; | ||
+ | |||
+ | while(rest >= size) | ||
+ | |||
+ | rest = rest - size; | ||
+ | |||
+ | } | ||
+ | |||
+ | for(int i = (size*size) - size + rest; i >= idx - rest; i -= size) | ||
+ | |||
+ | for(int j = i, k = rest; k >= 0; j--, k--) | ||
+ | |||
+ | sum += A[j]; | ||
+ | |||
+ | B[idx] = sum; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | int main(int argc, char* argv[]){ | ||
+ | |||
+ | if(argc == 2){ // only one argument (program name + one argument) allowed | ||
+ | |||
+ | int size = atoi(argv[1]); | ||
+ | |||
+ | int tSize = size * size; | ||
+ | |||
+ | int d; | ||
+ | |||
+ | int nThreads; | ||
+ | |||
+ | int mThreads; | ||
+ | |||
+ | int Blocks; | ||
+ | |||
+ | int Threads; | ||
+ | |||
+ | float *a = new float[tSize]; | ||
+ | |||
+ | float *A; | ||
+ | |||
+ | float *b = new float[tSize]; | ||
+ | |||
+ | float *B; | ||
+ | |||
+ | cudaError_t error; // error handler | ||
+ | |||
+ | |||
+ | |||
+ | /* Gets the maximum number of threads and blocks */ | ||
+ | |||
+ | cudaDeviceProp prop; | ||
+ | |||
+ | cudaGetDevice(&d); | ||
+ | |||
+ | cudaGetDeviceProperties(&prop, d); | ||
+ | |||
+ | nThreads = prop.maxThreadsDim[0]; | ||
+ | |||
+ | mThreads = nThreads * prop.maxGridSize[0]; | ||
+ | |||
+ | |||
+ | |||
+ | /* Checks if the size of the matrix is less than the maximum number of threads */ | ||
+ | |||
+ | if((tSize) < nThreads){ | ||
+ | |||
+ | Blocks = 1; | ||
+ | |||
+ | Threads = tSize; | ||
+ | |||
+ | } | ||
+ | |||
+ | /* Checks if the size of the matrix is greater than the maximum number of threads */ | ||
+ | |||
+ | else if((tSize) > nThreads){ | ||
+ | |||
+ | Blocks = (tSize + nThreads - 1) / nThreads; | ||
+ | |||
+ | Threads = nThreads; | ||
+ | |||
+ | } | ||
+ | |||
+ | /* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */ | ||
+ | |||
+ | else if((tSize) > mThreads){ | ||
+ | |||
+ | tSize = mThreads; | ||
+ | |||
+ | Blocks = (tSize + nThreads - 1) / nThreads; | ||
+ | |||
+ | Threads = nThreads; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | dim3 dGrid(Blocks, Blocks, 1); // sets the grids | ||
+ | |||
+ | dim3 dBlock(Threads, Threads, 1); // sets the blocks | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | initializeMatrix(a,size); // initializes the matrix a | ||
+ | |||
+ | error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A; | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B; | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Performs the SAT on the device on A and stores it on B */ | ||
+ | |||
+ | SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b | ||
+ | |||
+ | cudaDeviceSynchronize(); // synchronizes the host and the device | ||
+ | |||
+ | error = cudaGetLastError(); | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | cudaFree(a); | ||
+ | |||
+ | cudaFree(b); | ||
+ | |||
+ | delete [] a; | ||
+ | |||
+ | delete [] b; | ||
+ | |||
+ | return 3; | ||
+ | |||
+ | } | ||
+ | |||
+ | /* copies device matrix B into host matrix b */ | ||
+ | |||
+ | error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost); | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | //matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a | ||
+ | |||
+ | //matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b | ||
+ | |||
+ | |||
+ | |||
+ | /* deallocates space in both host memory and device memory */ | ||
+ | |||
+ | cudaFree(A); | ||
+ | |||
+ | cudaFree(B); | ||
+ | |||
+ | delete [] a; | ||
+ | |||
+ | delete [] b; | ||
+ | |||
+ | cudaDeviceReset(); | ||
+ | |||
+ | cout << "Finished" << endl; | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | else if(argc < 2){ | ||
+ | |||
+ | cout << "Please provide a size" << endl; // when no arguments are supplied | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | else{ | ||
+ | |||
+ | cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | ---- Profiling Results for the summedAreaTable() function ------ | ||
+ | |||
+ | |||
+ | |||
+ | Word Problem A1/CPU (Seconds) A2/GPU (Seconds) | ||
+ | |||
+ | 100 0.03 0.0034 | ||
+ | 200 0.61 0.0445 | ||
+ | 300 3.08 0.2124 | ||
+ | 400 9.66 0.6549 | ||
+ | 500 24.17 1.58 | ||
+ | 600 54.4 3.268 | ||
+ | 700 113.17 5.976 | ||
+ | |||
+ | -------------------------------------------------------------------- | ||
+ | |||
+ | </pre> | ||
+ | |||
+ | === Assignment 3 === | ||
+ | |||
+ | Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html. |
Latest revision as of 14:59, 15 April 2013
Contents
Team Carlos
Team Members
Progress
Assignment 1
For my assignment 1, I profiled a Summarized Area Table code. Here is the code:
#include <iostream> #include <cstdlib> using namespace std; /* Creates the Matrice */ void createMatrice(float** a, int size){ for(int i = 0; i < size; i++) a[i] = new float[size]; } /* Initializes the matrice to any random number between 1 and 9 */ void initializeMatrice(float** a, int size){ float f = 1.0 / RAND_MAX; for(int i = 0; i < size; i++) for(int j = 0; j < size; j++) a[i][j] = rand() * f; } /* Creates the summarized area table */ void summarizedAreaTable(float** a, float** b, int size){ int k = 0; float sum = 0.0; for(int i = size-1; i >= 0; i--){ for(int j = 0; j < size; j++){ for(int k = i; k < size; k++){ for(int m = 0; m <= j; m++){ sum += a[k][m]; } } b[i][j] = sum; sum = 0.0; } } } int main(int argc, char* argv[]){ if(argc == 2){ // only one argument (program name + one argument) allowed int size = atoi(argv[1]); float **a = new float*[size]; float **b = new float*[size]; createMatrice(a,size); // creates the matrice a createMatrice(b,size); // creates the matrice b initializeMatrice(a,size); // initializes the matrices summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b cout << "Finished" << endl; return 0; } else if(argc < 2) cout << "Please provide a size" << endl; // when no arguments are supplied else cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied } ---- Profiling Results for the summedAreaTable() function ------ Word Problem Seconds 250 1.50 500 25.87 750 173.99 1000 658.34 --------------------------------------------------------------------
Assignment 2
Here is my code to parallelize the SAT algorithm I worked on A1:
#include <iostream> #include <cstdlib> #include <cuda_runtime.h> using namespace std; /* Initializes the matrix to any random number between 0 and 1 */ void initializeMatrix(float* a, int size){ float f = 1.0 / RAND_MAX; for(int i = 0; i < size * size; i++) a[i] = rand() * f; } /* Displays the matrix */ void matrixDisplay(char matrix, float* a, int size){ int i = 0; cout << matrix <<" is: " << endl; while(i < size*size){ for(int j = 0; j < size; j++, i++) cout << a[i] << " "; cout << endl; } } /* Creates the Summed area table */ __global__ void SummedAreaTable(float* A, float* B, int size){ int idx = blockIdx.x * blockDim.x + threadIdx.x; float sum = 0; int rest; if(idx < (size*size)){ if(idx < size) rest = idx; else{ rest = idx - size; while(rest >= size) rest = rest - size; } for(int i = (size*size) - size + rest; i >= idx - rest; i -= size) for(int j = i, k = rest; k >= 0; j--, k--) sum += A[j]; B[idx] = sum; } } int main(int argc, char* argv[]){ if(argc == 2){ // only one argument (program name + one argument) allowed int size = atoi(argv[1]); int tSize = size * size; int d; int nThreads; int mThreads; int Blocks; int Threads; float *a = new float[tSize]; float *A; float *b = new float[tSize]; float *B; cudaError_t error; // error handler /* Gets the maximum number of threads and blocks */ cudaDeviceProp prop; cudaGetDevice(&d); cudaGetDeviceProperties(&prop, d); nThreads = prop.maxThreadsDim[0]; mThreads = nThreads * prop.maxGridSize[0]; /* Checks if the size of the matrix is less than the maximum number of threads */ if((tSize) < nThreads){ Blocks = 1; Threads = tSize; } /* Checks if the size of the matrix is greater than the maximum number of threads */ else if((tSize) > nThreads){ Blocks = (tSize + nThreads - 1) / nThreads; Threads = nThreads; } /* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */ else if((tSize) > mThreads){ tSize = mThreads; Blocks = (tSize + nThreads - 1) / nThreads; Threads = nThreads; } dim3 dGrid(Blocks, Blocks, 1); // sets the grids dim3 dBlock(Threads, Threads, 1); // sets the blocks initializeMatrix(a,size); // initializes the matrix a error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A; if (error != cudaSuccess) { cout << cudaGetErrorString(error) << endl; } error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B; if (error != cudaSuccess) { cout << cudaGetErrorString(error) << endl; } error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A if (error != cudaSuccess) { cout << cudaGetErrorString(error) << endl; } /* Performs the SAT on the device on A and stores it on B */ SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b cudaDeviceSynchronize(); // synchronizes the host and the device error = cudaGetLastError(); if (error != cudaSuccess) { cout << cudaGetErrorString(error) << endl; cudaFree(a); cudaFree(b); delete [] a; delete [] b; return 3; } /* copies device matrix B into host matrix b */ error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost); if (error != cudaSuccess) { cout << cudaGetErrorString(error) << endl; } //matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a //matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b /* deallocates space in both host memory and device memory */ cudaFree(A); cudaFree(B); delete [] a; delete [] b; cudaDeviceReset(); cout << "Finished" << endl; return 0; } else if(argc < 2){ cout << "Please provide a size" << endl; // when no arguments are supplied return 0; } else{ cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied return 0; } } ---- Profiling Results for the summedAreaTable() function ------ Word Problem A1/CPU (Seconds) A2/GPU (Seconds) 100 0.03 0.0034 200 0.61 0.0445 300 3.08 0.2124 400 9.66 0.6549 500 24.17 1.58 600 54.4 3.268 700 113.17 5.976 --------------------------------------------------------------------
Assignment 3
Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.