Changes

Jump to: navigation, search

Carlos

7,515 bytes added, 15:59, 15 April 2013
Progress
= Team Carlos = == Team Members == # [mailto:cjconejomolero@myseneca.ca?subject=gpu610 Carlos Conejo]== Progress ===== Assignment 1 ===For my assignment 1, I profiled a Summarized Area Table code.Here is the code:  <nowikipre>asd#include <iostream> #include <cstdlib>   using namespace std;   /* Creates the Matrice */ void createMatrice(float** a, int size){  for(int i = 0; i < size; i++)  a[i] = new float[size]; }   /* Initializes the matrice to any random number between 1 and 9 */ void initializeMatrice(float** a, int size){  float f = 1.0 / RAND_MAX;  for(int i = 0; i < size; i++)  for(int j = 0; j < size; j++)  a[i][j] = rand() * f; }   /* Creates the summarized area table */ void summarizedAreaTable(float** a, float** b, int size){  int k = 0;  float sum = 0.0;  for(int i = size-1; i >= 0; i--){  for(int j = 0; j < size; j++){  for(int k = i; k < size; k++){  for(int m = 0; m <= j; m++){  sum += a[k][m];  }  }  b[i][j] = sum;  sum = 0.0;  }  } }   int main(int argc, char* argv[]){  if(argc == 2){ // only one argument (program name + one argument) allowed  int size = atoi(argv[1]);  float **a = new float*[size];  float **b = new float*[size];    createMatrice(a,size); // creates the matrice a  createMatrice(b,size); // creates the matrice b  initializeMatrice(a,size); // initializes the matrices  summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b    cout << "Finished" << endl;  return 0;  }  else if(argc < 2)  cout << "Please provide a size" << endl; // when no arguments are supplied  else  cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied }  ---- Profiling Results for the summedAreaTable() function ------    Word Problem Seconds   250 1.50  500 25.87  750 173.99   1000 658.34   --------------------------------------------------------------------    </pre> === Assignment 2 === Here is my code to parallelize the SAT algorithm I worked on A1: <pre> #include <iostream> #include <cstdlib> #include <cuda_runtime.h>   using namespace std;   /* Initializes the matrix to any random number between 0 and 1 */ void initializeMatrix(float* a, int size){  float f = 1.0 / RAND_MAX;  for(int i = 0; i < size * size; i++)  a[i] = rand() * f; }   /* Displays the matrix */ void matrixDisplay(char matrix, float* a, int size){  int i = 0;  cout << matrix <<" is: " << endl;  while(i < size*size){  for(int j = 0; j < size; j++, i++)  cout << a[i] << " ";  cout << endl;  } }   /* Creates the Summed area table */nowiki __global__ void SummedAreaTable(float* A, float* B, int size){  int idx = blockIdx.x * blockDim.x + threadIdx.x;  float sum = 0;  int rest;    if(idx < (size*size)){  if(idx < size)  rest = idx;  else{  rest = idx - size;  while(rest >= size)  rest = rest - size;  }  for(int i = (size*size) - size + rest; i >= idx - rest; i -= size)  for(int j = i, k = rest; k >= 0; j--, k--)  sum += A[j];  B[idx] = sum;  } }   int main(int argc, char* argv[]){  if(argc == 2){ // only one argument (program name + one argument) allowed  int size = atoi(argv[1]);  int tSize = size * size;  int d;  int nThreads;  int mThreads;  int Blocks;  int Threads;  float *a = new float[tSize];  float *A;  float *b = new float[tSize];  float *B;  cudaError_t error; // error handler    /* Gets the maximum number of threads and blocks */  cudaDeviceProp prop;  cudaGetDevice(&d);  cudaGetDeviceProperties(&prop, d);  nThreads = prop.maxThreadsDim[0];  mThreads = nThreads * prop.maxGridSize[0];    /* Checks if the size of the matrix is less than the maximum number of threads */  if((tSize) < nThreads){  Blocks = 1;  Threads = tSize;  }  /* Checks if the size of the matrix is greater than the maximum number of threads */  else if((tSize) > nThreads){  Blocks = (tSize + nThreads - 1) / nThreads;  Threads = nThreads;  }  /* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */  else if((tSize) > mThreads){  tSize = mThreads;  Blocks = (tSize + nThreads - 1) / nThreads;  Threads = nThreads;  }    dim3 dGrid(Blocks, Blocks, 1); // sets the grids  dim3 dBlock(Threads, Threads, 1); // sets the blocks      initializeMatrix(a,size); // initializes the matrix a  error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A;  if (error != cudaSuccess) {  cout << cudaGetErrorString(error) << endl;  }  error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B;  if (error != cudaSuccess) {  cout << cudaGetErrorString(error) << endl;  }  error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A  if (error != cudaSuccess) {  cout << cudaGetErrorString(error) << endl;  }    /* Performs the SAT on the device on A and stores it on B */  SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b  cudaDeviceSynchronize(); // synchronizes the host and the device  error = cudaGetLastError();  if (error != cudaSuccess) {  cout << cudaGetErrorString(error) << endl;  cudaFree(a);  cudaFree(b);  delete [] a;  delete [] b;  return 3;  }  /* copies device matrix B into host matrix b */  error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost);   if (error != cudaSuccess) {  cout << cudaGetErrorString(error) << endl;  }    //matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a  //matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b    /* deallocates space in both host memory and device memory */  cudaFree(A);  cudaFree(B);  delete [] a;  delete [] b;  cudaDeviceReset();  cout << "Finished" << endl;  return 0;  }  else if(argc < 2){  cout << "Please provide a size" << endl; // when no arguments are supplied  return 0;  }  else{  cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied  return 0;  } }  ---- Profiling Results for the summedAreaTable() function ------   Word Problem A1/CPU (Seconds) A2/GPU (Seconds)   100 0.03 0.0034 200 0.61 0.0445 300 3.08 0.2124 400 9.66 0.6549 500 24.17 1.58 600 54.4 3.268 700 113.17 5.976 -------------------------------------------------------------------- </pre> === Assignment 3 === Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.

Navigation menu