Changes

← Older edit

Carlos

5,205 bytes added, 15:59, 15 April 2013

→‎Progress

~~// sat.cpp~~ /* ~~---- Profiling Results for the summarizedAreaTable() function ------~~ ~~Word Problem Seconds~~ ~~250 1.50~~ ~~500 25.87~~ ~~750 173.99~~ ~~1000 658.34~~ ~~--------------------------------------------------------------------~~ */ <pre>

#include <iostream>

cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied

}

---- Profiling Results for the summedAreaTable() function ------

Word Problem Seconds

250 1.50

500 25.87

750 173.99

1000 658.34

--------------------------------------------------------------------

</pre>

=== Assignment 2 ===

Here is my code to parallelize the SAT algorithm I worked on in A1:

<pre>

#include <iostream>

#include <cstdlib>

#include <cuda_runtime.h>

using namespace std;

/*
 * Fills the first size * size elements of a with pseudo-random values.
 * Each value is rand() scaled by 1 / RAND_MAX, i.e. a number between 0 and 1.
 *
 * a    - host array that receives the values
 * size - dimension of the matrix; size * size elements are written
 */
void initializeMatrix(float* a, int size){
    const float scale = 1.0 / RAND_MAX;   // converts rand()'s output to a fraction of RAND_MAX
    const int count = size * size;        // total number of elements to fill
    int filled = 0;
    while(filled < count){
        a[filled] = rand() * scale;
        ++filled;
    }
}

/*
 * Displays the matrix.
 *
 * Prints a header line made of the label character followed by " is: ",
 * then the size x size matrix a one row per line, elements separated by a space.
 *
 * matrix - one-character label of the matrix (e.g. 'a' or 'b')
 * a      - host array holding size * size elements in row-major order
 * size   - dimension of the matrix; nothing but the header is printed when size <= 0
 */
void matrixDisplay(char matrix, float* a, int size){
    // The label already names the matrix; the old "a is: " suffix printed "aa is:" / "ba is:"
    cout << matrix << " is: " << endl;

    // A negative size used to loop forever (size * size > 0 but the row loop never advanced)
    if(size <= 0)
        return;

    for(int row = 0; row < size; row++){
        for(int col = 0; col < size; col++)
            cout << a[row * size + col] << " ";
        cout << endl;
    }
}

/*
 * Creates the Summed area table.
 *
 * For the element at (row, col) of the row-major size x size matrix A, stores in B
 * the sum of every element of A located in rows row..size-1 and columns 0..col.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its flat global index
 * and advances by the total number of launched threads, so any grid/block size
 * covers the whole matrix. No shared memory is used.
 *
 * A    - device pointer to the input matrix (size * size floats)
 * B    - device pointer to the output matrix (size * size floats)
 * size - dimension of the matrices
 */
__global__ void SummedAreaTable(float* A, float* B, int size){
    // 64-bit arithmetic so that size * size and the flat index cannot overflow
    const long long total  = (long long)size * size;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for(long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride){
        const int row = (int)(idx / size);
        const int col = (int)(idx % size);   // replaces the former repeated-subtraction loop
        float sum = 0.0f;

        // Same accumulation order as before: last row first, and right to left inside each row
        for(int r = size - 1; r >= row; r--){
            const long long rowStart = (long long)r * size;
            for(int c = col; c >= 0; c--)
                sum += A[rowStart + c];
        }
        B[idx] = sum;
    }
}

/* Prints the CUDA error message when error is not cudaSuccess; returns true on success */
static bool cudaSucceeded(cudaError_t error){
    if(error == cudaSuccess)
        return true;
    cout << cudaGetErrorString(error) << endl;
    return false;
}

/* Deallocates the device matrices A and B and the host matrices a and b (NULL is accepted) */
static void releaseMatrices(float* A, float* B, float* a, float* b){
    cudaFree(A);
    cudaFree(B);
    delete [] a;
    delete [] b;
}

/*
 * Builds a random size x size matrix on the host, computes its summed area table
 * on the device and copies the result back.
 *
 * Usage: program <size>
 *
 * Returns 0 on success or when the argument count is wrong (a message is printed),
 * 1 when the size is not usable, 2 when device setup/allocation/upload fails,
 * 3 when the kernel fails and 4 when the download of the result fails.
 */
int main(int argc, char* argv[]){
    if(argc < 2){
        cout << "Please provide a size" << endl; // when no arguments are supplied
        return 0;
    }
    if(argc > 2){
        cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
        return 0;
    }

    // only one argument (program name + one argument) allowed
    // 46340 is the largest size whose square still fits in a 32-bit int,
    // which the host helpers use for their element counts
    const int maxSize = 46340;
    int size = atoi(argv[1]);
    if(size <= 0 || size > maxSize){
        cout << "Size must be an integer between 1 and " << maxSize << endl;
        return 1;
    }
    int tSize = size * size;

    /* Gets the maximum number of threads and blocks */
    int d = 0;
    cudaDeviceProp prop;
    if(!cudaSucceeded(cudaGetDevice(&d)) || !cudaSucceeded(cudaGetDeviceProperties(&prop, d)))
        return 2;

    // A 1-D block is limited both by the per-block total and by the x-dimension limit
    int nThreads = prop.maxThreadsPerBlock < prop.maxThreadsDim[0] ? prop.maxThreadsPerBlock
                                                                   : prop.maxThreadsDim[0];

    /* One thread per element when the matrix fits in a single block, full blocks otherwise */
    int Threads = tSize < nThreads ? tSize : nThreads;
    long long wanted = ((long long)tSize + Threads - 1) / Threads;

    /* The grid is clamped to the device limit; the kernel's stride loop covers the remainder */
    int Blocks = wanted < prop.maxGridSize[0] ? (int)wanted : prop.maxGridSize[0];

    float *a = new float[tSize];
    float *b = new float[tSize];
    float *A = NULL;
    float *B = NULL;

    initializeMatrix(a, size); // initializes the matrix a

    /* allocates A and B on the device and copies the host matrix a into the device matrix A */
    if(!cudaSucceeded(cudaMalloc((void**)&A, tSize * sizeof(float))) ||
       !cudaSucceeded(cudaMalloc((void**)&B, tSize * sizeof(float))) ||
       !cudaSucceeded(cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice))){
        releaseMatrices(A, B, a, b);
        return 2;
    }

    /* Performs the SAT on the device on A and stores it on B */
    SummedAreaTable<<<Blocks, Threads>>>(A, B, size);

    // cudaGetLastError reports a bad launch configuration,
    // cudaDeviceSynchronize reports a failure while the kernel was running
    if(!cudaSucceeded(cudaGetLastError()) || !cudaSucceeded(cudaDeviceSynchronize())){
        releaseMatrices(A, B, a, b);
        return 3;
    }

    /* copies device matrix B into host matrix b */
    if(!cudaSucceeded(cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost))){
        releaseMatrices(A, B, a, b);
        return 4;
    }

    //matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
    //matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b

    /* deallocates space in both host memory and device memory */
    releaseMatrices(A, B, a, b);
    cudaDeviceReset();

    cout << "Finished" << endl;
    return 0;
}
---- Profiling Results for the summedAreaTable() function ------
Word Problem A1/CPU (Seconds) A2/GPU (Seconds)
100 0.03 0.0034
200 0.61 0.0445
300 3.08 0.2124
400 9.66 0.6549
500 24.17 1.58
600 54.4 3.268
700 113.17 5.976
--------------------------------------------------------------------
</pre>
=== Assignment 3 ===
Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: 
http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.

Carlos Javier Conejo Molero

1

edit

CDOT Wiki β

Changes

Carlos

CDOT Wiki ^β