Changes

Jump to: navigation, search

Carlos

7,520 bytes added, 15:59, 15 April 2013
Progress
= Team Carlos =
 
== Team Members ==
# [mailto:cjconejomolero@myseneca.ca?subject=gpu610 Carlos Conejo]
== Progress ==
=== Assignment 1 ===
For my assignment 1, I profiled a Summarized Area Table code.
Here is the code:
 
 
<pre>
#include <iostream>
 
#include <cstdlib>
 
 
 
using namespace std;
 
 
 
/* Creates the Matrice */
 
void createMatrice(float** a, int size){
 
for(int i = 0; i < size; i++)
 
a[i] = new float[size];
 
}
 
 
 
/* Initializes the matrice to any random number between 1 and 9 */
 
void initializeMatrice(float** a, int size){
 
float f = 1.0 / RAND_MAX;
 
for(int i = 0; i < size; i++)
 
for(int j = 0; j < size; j++)
 
a[i][j] = rand() * f;
 
}
 
 
 
/* Creates the summarized area table */
 
void summarizedAreaTable(float** a, float** b, int size){
 
int k = 0;
 
float sum = 0.0;
 
for(int i = size-1; i >= 0; i--){
 
for(int j = 0; j < size; j++){
 
for(int k = i; k < size; k++){
 
for(int m = 0; m <= j; m++){
 
sum += a[k][m];
 
}
 
}
 
b[i][j] = sum;
 
sum = 0.0;
 
}
 
}
 
}
 
 
 
int main(int argc, char* argv[]){
 
if(argc == 2){ // only one argument (program name + one argument) allowed
 
int size = atoi(argv[1]);
 
float **a = new float*[size];
 
float **b = new float*[size];
 
 
 
createMatrice(a,size); // creates the matrice a
 
createMatrice(b,size); // creates the matrice b
 
initializeMatrice(a,size); // initializes the matrices
 
summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b
 
 
 
cout << "Finished" << endl;
 
return 0;
 
}
 
else if(argc < 2)
 
cout << "Please provide a size" << endl; // when no arguments are supplied
 
else
 
cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
 
}
 
 
---- Profiling Results for the summedAreaTable() function ------
 
 
 
Word Problem Seconds
 
250 1.50
 
500 25.87
 
750 173.99
 
1000 658.34
 
 
--------------------------------------------------------------------
 
 
 
 
</pre>
 
=== Assignment 2 ===
 
Here is my code to parallelize the SAT algorithm I worked on A1:
 
<pre>
 
#include <iostream>
 
#include <cstdlib>
 
#include <cuda_runtime.h>
 
 
 
using namespace std;
 
 
 
/* Initializes the matrix to any random number between 0 and 1 */
 
void initializeMatrix(float* a, int size){
 
float f = 1.0 / RAND_MAX;
 
for(int i = 0; i < size * size; i++)
 
a[i] = rand() * f;
 
}
 
 
 
/* Displays the matrix */
 
void matrixDisplay(char matrix, float* a, int size){
 
int i = 0;
 
cout << matrix <<" is: " << endl;
 
while(i < size*size){
 
for(int j = 0; j < size; j++, i++)
 
cout << a[i] << " ";
 
cout << endl;
 
}
 
}
 
 
 
/* Creates the Summed area table */
 
__global__ void SummedAreaTable(float* A, float* B, int size){
 
int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
float sum = 0;
 
int rest;
 
 
if(idx < (size*size)){
 
if(idx < size)
 
rest = idx;
 
else{
 
rest = idx - size;
 
while(rest >= size)
 
rest = rest - size;
 
}
 
for(int i = (size*size) - size + rest; i >= idx - rest; i -= size)
 
for(int j = i, k = rest; k >= 0; j--, k--)
 
sum += A[j];
 
B[idx] = sum;
 
}
 
}
 
 
 
int main(int argc, char* argv[]){
 
if(argc == 2){ // only one argument (program name + one argument) allowed
 
int size = atoi(argv[1]);
 
int tSize = size * size;
 
int d;
 
int nThreads;
 
int mThreads;
 
int Blocks;
 
int Threads;
 
float *a = new float[tSize];
 
float *A;
 
float *b = new float[tSize];
 
float *B;
 
cudaError_t error; // error handler
 
 
/* Gets the maximum number of threads and blocks */
 
cudaDeviceProp prop;
 
cudaGetDevice(&d);
 
cudaGetDeviceProperties(&prop, d);
 
nThreads = prop.maxThreadsDim[0];
 
mThreads = nThreads * prop.maxGridSize[0];
 
 
/* Checks if the size of the matrix is less than the maximum number of threads */
 
if((tSize) < nThreads){
 
Blocks = 1;
 
Threads = tSize;
 
}
 
/* Checks if the size of the matrix is greater than the maximum number of threads */
 
else if((tSize) > nThreads){
 
Blocks = (tSize + nThreads - 1) / nThreads;
 
Threads = nThreads;
 
}
 
/* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */
 
else if((tSize) > mThreads){
 
tSize = mThreads;
 
Blocks = (tSize + nThreads - 1) / nThreads;
 
Threads = nThreads;
 
}
 
 
dim3 dGrid(Blocks, Blocks, 1); // sets the grids
 
dim3 dBlock(Threads, Threads, 1); // sets the blocks
 
 
 
initializeMatrix(a,size); // initializes the matrix a
 
error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A;
 
if (error != cudaSuccess) {
 
cout << cudaGetErrorString(error) << endl;
 
}
 
error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B;
 
if (error != cudaSuccess) {
 
cout << cudaGetErrorString(error) << endl;
 
}
 
error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A
 
if (error != cudaSuccess) {
 
cout << cudaGetErrorString(error) << endl;
 
}
 
 
/* Performs the SAT on the device on A and stores it on B */
 
SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b
 
cudaDeviceSynchronize(); // synchronizes the host and the device
 
error = cudaGetLastError();
 
if (error != cudaSuccess) {
 
cout << cudaGetErrorString(error) << endl;
 
cudaFree(a);
 
cudaFree(b);
 
delete [] a;
 
delete [] b;
 
return 3;
 
}
 
/* copies device matrix B into host matrix b */
 
error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost);
 
if (error != cudaSuccess) {
 
cout << cudaGetErrorString(error) << endl;
 
}
 
 
//matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
 
//matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b
 
 
/* deallocates space in both host memory and device memory */
 
cudaFree(A);
 
cudaFree(B);
 
delete [] a;
 
delete [] b;
 
cudaDeviceReset();
 
cout << "Finished" << endl;
 
return 0;
 
}
 
else if(argc < 2){
 
cout << "Please provide a size" << endl; // when no arguments are supplied
 
return 0;
 
}
 
else{
 
cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
 
return 0;
 
}
 
}
 
 
---- Profiling Results for the summedAreaTable() function ------
 
 
 
Word Problem A1/CPU (Seconds) A2/GPU (Seconds)
 
100 0.03 0.0034
200 0.61 0.0445
300 3.08 0.2124
400 9.66 0.6549
500 24.17 1.58
600 54.4 3.268
700 113.17 5.976
 
--------------------------------------------------------------------
 
</pre>
 
=== Assignment 3 ===
 
Due to the difficulty of optimizing the code provided in A2 to parallelize a Summed Area Table, The professor and I accorded for me to only provide an explanation of how would I optimize my code by using a prefix sum method: http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html.

Navigation menu