Difference between revisions of "Carlos"
(→Team Carlos) |
(→Progress) |
||
Line 166: | Line 166: | ||
*/ | */ | ||
+ | |||
+ | === Assignment 2 === | ||
+ | |||
+ | |||
+ | #include <iostream> | ||
+ | |||
+ | #include <cstdlib> | ||
+ | |||
+ | #include <cuda_runtime.h> | ||
+ | |||
+ | |||
+ | |||
+ | using namespace std; | ||
+ | |||
+ | |||
+ | |||
+ | /* Initializes the matrix to any random number between 0 and 1 */ | ||
+ | |||
+ | void initializeMatrix(float* a, int size){ | ||
+ | |||
+ | float f = 1.0 / RAND_MAX; | ||
+ | |||
+ | for(int i = 0; i < size * size; i++) | ||
+ | |||
+ | a[i] = rand() * f; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Displays the matrix */ | ||
+ | |||
+ | void matrixDisplay(char matrix, float* a, int size){ | ||
+ | |||
+ | int i = 0; | ||
+ | |||
+ | cout << matrix <<" is: " << endl; | ||
+ | |||
+ | while(i < size*size){ | ||
+ | |||
+ | for(int j = 0; j < size; j++, i++) | ||
+ | |||
+ | cout << a[i] << " "; | ||
+ | |||
+ | cout << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Creates the Summed area table */ | ||
+ | |||
+ | __global__ void SummedAreaTable(float* A, float* B, int size){ | ||
+ | |||
+ | int idx = blockIdx.x * blockDim.x + threadIdx.x; | ||
+ | |||
+ | float sum = 0; | ||
+ | |||
+ | int rest; | ||
+ | |||
+ | |||
+ | |||
+ | if(idx < (size*size)){ | ||
+ | |||
+ | if(idx < size) | ||
+ | |||
+ | rest = idx; | ||
+ | |||
+ | else{ | ||
+ | |||
+ | rest = idx - size; | ||
+ | |||
+ | while(rest >= size) | ||
+ | |||
+ | rest = rest - size; | ||
+ | |||
+ | } | ||
+ | |||
+ | for(int i = (size*size) - size + rest; i >= idx - rest; i -= size) | ||
+ | |||
+ | for(int j = i, k = rest; k >= 0; j--, k--) | ||
+ | |||
+ | sum += A[j]; | ||
+ | |||
+ | B[idx] = sum; | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | int main(int argc, char* argv[]){ | ||
+ | |||
+ | if(argc == 2){ // only one argument (program name + one argument) allowed | ||
+ | |||
+ | int size = atoi(argv[1]); | ||
+ | |||
+ | int tSize = size * size; | ||
+ | |||
+ | int d; | ||
+ | |||
+ | int nThreads; | ||
+ | |||
+ | int mThreads; | ||
+ | |||
+ | int Blocks; | ||
+ | |||
+ | int Threads; | ||
+ | |||
+ | float *a = new float[tSize]; | ||
+ | |||
+ | float *A; | ||
+ | |||
+ | float *b = new float[tSize]; | ||
+ | |||
+ | float *B; | ||
+ | |||
+ | cudaError_t error; // error handler | ||
+ | |||
+ | |||
+ | |||
+ | /* Gets the maximum number of threads and blocks */ | ||
+ | |||
+ | cudaDeviceProp prop; | ||
+ | |||
+ | cudaGetDevice(&d); | ||
+ | |||
+ | cudaGetDeviceProperties(&prop, d); | ||
+ | |||
+ | nThreads = prop.maxThreadsDim[0]; | ||
+ | |||
+ | mThreads = nThreads * prop.maxGridSize[0]; | ||
+ | |||
+ | |||
+ | |||
+ | /* Checks if the size of the matrix is less than the maximum number of threads */ | ||
+ | |||
+ | if((tSize) < nThreads){ | ||
+ | |||
+ | Blocks = 1; | ||
+ | |||
+ | Threads = tSize; | ||
+ | |||
+ | } | ||
+ | |||
+ | /* Checks if the size of the matrix is greater than the maximum number of threads */ | ||
+ | |||
+ | else if((tSize) > nThreads){ | ||
+ | |||
+ | Blocks = (tSize + nThreads - 1) / nThreads; | ||
+ | |||
+ | Threads = nThreads; | ||
+ | |||
+ | } | ||
+ | |||
+ | /* Checks if the size of the matrix is less than the maximum number of threads multipled by the maximum number of blocks */ | ||
+ | |||
+ | else if((tSize) > mThreads){ | ||
+ | |||
+ | tSize = mThreads; | ||
+ | |||
+ | Blocks = (tSize + nThreads - 1) / nThreads; | ||
+ | |||
+ | Threads = nThreads; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | dim3 dGrid(Blocks, Blocks, 1); // sets the grids | ||
+ | |||
+ | dim3 dBlock(Threads, Threads, 1); // sets the blocks | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | initializeMatrix(a,size); // initializes the matrix a | ||
+ | |||
+ | error = cudaMalloc((void**)&A, tSize * sizeof(float)); // allocates memory on the device for matrix A; | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | error = cudaMalloc((void**)&B, tSize * sizeof(float)); // allocates memory on the device for matrix B; | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | error = cudaMemcpy(A, a, tSize * sizeof(float), cudaMemcpyHostToDevice); // copies the host matrix a into the device matrix A | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | /* Performs the SAT on the device on A and stores it on B */ | ||
+ | |||
+ | SummedAreaTable<<<Blocks,Threads>>>(A,B,size); // Does the SAT on a and stores it on b | ||
+ | |||
+ | cudaDeviceSynchronize(); // synchronizes the host and the device | ||
+ | |||
+ | error = cudaGetLastError(); | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | cudaFree(a); | ||
+ | |||
+ | cudaFree(b); | ||
+ | |||
+ | delete [] a; | ||
+ | |||
+ | delete [] b; | ||
+ | |||
+ | return 3; | ||
+ | |||
+ | } | ||
+ | |||
+ | /* copies device matrix B into host matrix b */ | ||
+ | |||
+ | error = cudaMemcpy(b, B, tSize * sizeof(float), cudaMemcpyDeviceToHost); | ||
+ | |||
+ | if (error != cudaSuccess) { | ||
+ | |||
+ | cout << cudaGetErrorString(error) << endl; | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | //matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a | ||
+ | |||
+ | //matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b | ||
+ | |||
+ | |||
+ | |||
+ | /* deallocates space in both host memory and device memory */ | ||
+ | |||
+ | cudaFree(A); | ||
+ | |||
+ | cudaFree(B); | ||
+ | |||
+ | delete [] a; | ||
+ | |||
+ | delete [] b; | ||
+ | |||
+ | cudaDeviceReset(); | ||
+ | |||
+ | cout << "Finished" << endl; | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | else if(argc < 2){ | ||
+ | |||
+ | cout << "Please provide a size" << endl; // when no arguments are supplied | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | else{ | ||
+ | |||
+ | cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied | ||
+ | |||
+ | return 0; | ||
+ | |||
+ | } | ||
+ | |||
+ | } |
Revision as of 11:50, 8 March 2013
Team Carlos
Team Members
Progress
Assignment 1
For Assignment 1, I profiled a summed area table (SAT) implementation. Here is the code:
// sat.cpp
/*
Profiling Results for the summarizedAreaTable() function ------
Word Problem Seconds
250 1.50
500 25.87
750 173.99
1000 658.34
*/
#include <iostream>
#include <cstdlib>
using namespace std;
/* Allocates the rows of a square matrix: each of the first `size` slots of
   `a` receives a freshly allocated array of `size` floats. The row-pointer
   array `a` itself must already be allocated by the caller. */
void createMatrice(float** a, int size){
    int row = 0;
    while(row < size){
        a[row] = new float[size];
        ++row;
    }
}
/* Fills every cell of the size x size matrix `a` with a pseudo-random value
   in the range [0, 1], obtained by scaling rand() by 1 / RAND_MAX.
   Cells are filled row by row, consuming one rand() call per cell. */
void initializeMatrice(float** a, int size){
    const float scale = 1.0 / RAND_MAX;
    for(int row = 0; row < size; ++row){
        for(int col = 0; col < size; ++col){
            a[row][col] = rand() * scale;
        }
    }
}
/* Creates the summed area table of `a` in `b` (both size x size).
 *
 * b[i][j] is the sum of a[k][m] over all rows k with i <= k < size and all
 * columns m with 0 <= m <= j, i.e. the area from row i down to the last row
 * and from the first column across to column j.
 *
 * This is the brute-force version that was profiled: every cell re-adds its
 * whole area, so the work grows with size^4. The order in which the terms
 * are added is kept exactly as before so the float results are unchanged.
 */
void summarizedAreaTable(float** a, float** b, int size){
    for(int i = size-1; i >= 0; i--){
        for(int j = 0; j < size; j++){
            float sum = 0.0;            // accumulator scoped to a single cell
            for(int k = i; k < size; k++){
                for(int m = 0; m <= j; m++){
                    sum += a[k][m];
                }
            }
            b[i][j] = sum;
        }
    }
}
/* Entry point: expects exactly one argument, the matrix dimension.
 *
 * Builds a random size x size matrix, computes its summed area table and
 * prints "Finished". Returns 0 on success and after the usage messages,
 * and 1 when the size argument is not a positive integer.
 */
int main(int argc, char* argv[]){
    if(argc < 2){
        cout << "Please provide a size" << endl; // when no arguments are supplied
        return 0;
    }
    if(argc > 2){
        cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
        return 0;
    }

    int size = atoi(argv[1]);
    // atoi yields 0 for non-numeric text; a zero or negative dimension is meaningless
    if(size <= 0){
        cout << "Size must be a positive integer" << endl;
        return 1;
    }

    float **a = new float*[size];
    float **b = new float*[size];
    createMatrice(a,size); // creates the matrice a
    createMatrice(b,size); // creates the matrice b
    initializeMatrice(a,size); // initializes the matrices
    summarizedAreaTable(a,b,size); // Does the SAT on a and stores it on b

    // release every row first, then the row-pointer arrays themselves
    for(int i = 0; i < size; i++){
        delete [] a[i];
        delete [] b[i];
    }
    delete [] a;
    delete [] b;

    cout << "Finished" << endl;
    return 0;
}
/* To print the results
cout << "a is: " << endl;
for(int i = 0; i < size; i++){
for(int j = 0; j < size; j++)
cout << a[i][j] << " ";
cout << endl;
}
*/
Assignment 2
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
using namespace std;
/* Fills the size*size floats stored contiguously at `a` with pseudo-random
   values in the range [0, 1] (rand() scaled by 1 / RAND_MAX), in index
   order, one rand() call per element. */
void initializeMatrix(float* a, int size){
    const float scale = 1.0 / RAND_MAX;
    int idx = 0;
    while(idx < size * size){
        a[idx] = rand() * scale;
        ++idx;
    }
}
/* Displays the matrix.
 *
 * Prints the header "<matrix> is: " followed by the size x size matrix
 * stored contiguously, row after row, at `a`: one text line per row, each
 * value followed by a single space.
 *
 *   matrix  one-character label printed in the header
 *   a       host pointer to size*size floats
 *   size    matrix dimension; for size <= 0 only the header is printed
 */
void matrixDisplay(char matrix, float* a, int size){
    cout << matrix <<" is: " << endl;

    // A negative size used to loop forever: size*size is positive while the
    // per-row loop never ran, so the element counter never advanced.
    if(size <= 0)
        return;

    int i = 0;
    for(int row = 0; row < size; row++){
        for(int col = 0; col < size; col++, i++)
            cout << a[i] << " ";
        cout << endl;
    }
}
/* Creates the Summed area table.
 *
 * A and B are device pointers to size*size floats each, with element
 * (row, col) stored at index row*size + col. For every element the kernel
 * writes
 *     B[row*size + col] = sum of A[r*size + c]
 *                         for row <= r < size and 0 <= c <= col,
 * i.e. the area from this row down to the last row and from the first
 * column across to this column.
 *
 * Launch layout: one-dimensional grid and blocks (only the .x components
 * are read). Each thread starts at its global index and advances by the
 * total number of launched threads, so any grid/block size >= 1 covers the
 * whole matrix. No shared memory is used.
 *
 * size <= 0 is a no-op. Indices are computed in 64-bit so size*size cannot
 * overflow. The terms are added in the same order as before (last row
 * first, columns right to left), so float results are unchanged.
 */
__global__ void SummedAreaTable(float* A, float* B, int size){
    if(size <= 0)
        return;

    const long long total  = (long long)size * size;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for(long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride){
        const long long col      = idx % size;   // column of this element
        const long long rowStart = idx - col;    // index of the first element in this element's row
        float sum = 0;

        // walk the rows from the last one up to this element's row
        for(long long rowBase = total - size; rowBase >= rowStart; rowBase -= size)
            // within a row, add columns col, col-1, ..., 0
            for(long long j = rowBase + col; j >= rowBase; j--)
                sum += A[j];

        B[idx] = sum;
    }
}
/* Prints the CUDA error text when `status` is a failure. Returns true on success. */
static bool cudaSucceeded(cudaError_t status){
    if(status == cudaSuccess)
        return true;
    cout << cudaGetErrorString(status) << endl;
    return false;
}

/* Frees the device buffers (devA, devB) and the host buffers (hostA, hostB),
   resets the device and hands back `code` so callers can `return` it directly.
   Null device pointers are fine: cudaFree(0) performs no operation. */
static int releaseAndReturn(float* devA, float* devB, float* hostA, float* hostB, int code){
    cudaFree(devA);
    cudaFree(devB);
    delete [] hostA;
    delete [] hostB;
    cudaDeviceReset();
    return code;
}

/* Entry point: expects exactly one argument, the matrix dimension.
 *
 * Fills a size x size host matrix with random values, copies it to the
 * device, runs SummedAreaTable on it and copies the result back.
 *
 * Exit codes: 0 success (and after the usage messages), 1 invalid size,
 * 2 a CUDA query/allocation/copy failed or the matrix does not fit the
 * device's grid limit, 3 the kernel launch or execution failed.
 */
int main(int argc, char* argv[]){
    if(argc < 2){
        cout << "Please provide a size" << endl; // when no arguments are supplied
        return 0;
    }
    if(argc > 2){
        cout << "Only one size is allowed" << endl; // when more than one argument(the program name + one or more arguments) is supplied
        return 0;
    }

    // 46340 is the largest dimension whose square still fits a 32-bit int,
    // which is what the element count below is stored in.
    const int maxSize = 46340;
    int size = atoi(argv[1]);
    if(size <= 0 || size > maxSize){
        cout << "Size must be between 1 and " << maxSize << endl;
        return 1;
    }

    const int tSize = size * size;                       // number of matrix elements
    const size_t bytes = (size_t)tSize * sizeof(float);  // size of one matrix in bytes

    /* Gets the launch limits of the current device */
    int d = 0;
    cudaDeviceProp prop;
    if(!cudaSucceeded(cudaGetDevice(&d)) || !cudaSucceeded(cudaGetDeviceProperties(&prop, d)))
        return 2;

    /* One thread per element: at most 256 threads per block (fewer if the
       device allows less), and as many blocks as needed to cover tSize. */
    const int Threads = prop.maxThreadsPerBlock < 256 ? prop.maxThreadsPerBlock : 256;
    const int Blocks = (tSize + Threads - 1) / Threads;
    if(Blocks > prop.maxGridSize[0]){
        cout << "Matrix is too large for this device" << endl;
        return 2;
    }

    float *a = new float[tSize]; // host input matrix
    float *b = new float[tSize]; // host result matrix
    float *A = 0;                // device input matrix
    float *B = 0;                // device result matrix

    initializeMatrix(a,size); // initializes the matrix a

    /* allocates both device matrices and copies the host matrix a into the device matrix A */
    if(!cudaSucceeded(cudaMalloc((void**)&A, bytes)) ||
       !cudaSucceeded(cudaMalloc((void**)&B, bytes)) ||
       !cudaSucceeded(cudaMemcpy(A, a, bytes, cudaMemcpyHostToDevice)))
        return releaseAndReturn(A, B, a, b, 2);

    /* Performs the SAT on the device on A and stores it on B */
    SummedAreaTable<<<Blocks,Threads>>>(A,B,size);

    // cudaGetLastError reports a bad launch configuration; the synchronize
    // call waits for the kernel and reports faults raised while it ran
    if(!cudaSucceeded(cudaGetLastError()) || !cudaSucceeded(cudaDeviceSynchronize()))
        return releaseAndReturn(A, B, a, b, 3);

    /* copies device matrix B into host matrix b */
    if(!cudaSucceeded(cudaMemcpy(b, B, bytes, cudaMemcpyDeviceToHost)))
        return releaseAndReturn(A, B, a, b, 2);

    //matrixDisplay('a', a, size); //uncomment/comment to display/not display the matrix a
    //matrixDisplay('b', b, size); //uncomment/comment to display/not display the matrix b

    /* deallocates space in both host memory and device memory */
    releaseAndReturn(A, B, a, b, 0);
    cout << "Finished" << endl;
    return 0;
}