Changes

A-Team

2,880 bytes added, 05:11, 1 April 2019

→‎Back Propagation Acceleration

=== Assignment 2 ===

During assignment 2, we tried a simple kernel that took the shape of a dot product. It achieved nothing special: as predicted at the end of assignment 1, continuously calling cudaMalloc and cudaMemcpy had severe consequences on run time.

====Initial implementation====

// Terminates the program with the failing stage and the CUDA error name when
// `status` is not cudaSuccess; does nothing otherwise.
static void ddotCheck(cudaError_t status, const char* stage) {
    if (status != cudaSuccess) {
        cerr << "Failed @ " << stage << " " << cudaGetErrorName(status) << "!";
        exit(EXIT_FAILURE);
    }
}

// Host wrapper around the `kdot` kernel: uploads m1 and m2 to the device,
// launches kdot on a 2D grid of 32x32 blocks, and returns the device result
// as a host vector.
//
// Parameters:
//   m1, m2      flattened input matrices.
//   m1_rows     row count of m1.
//   m1_columns  column count of m1.
//   m2_columns  column count of m2.
//
// Returns a vector of m1_rows * m1_columns floats read back from the device,
// or an empty vector when any dimension is not positive.
//
// Any CUDA failure prints a message to cerr and exits with EXIT_FAILURE.
//
// NOTE(review): the output element count (m1_rows * m1_columns), the minimum
// m2 buffer size (m1_rows * m2_columns) and the grid extents are kept exactly
// as in the previous revision because kdot is not visible here. For a
// conventional (m1_rows x m1_columns) * (m1_columns x m2_columns) product the
// result would be m1_rows x m2_columns -- confirm against kdot.
vector <float> ddot(const vector <float>& m1, const vector <float>& m2, const int m1_rows, const int m1_columns, const int m2_columns) {
    // Degenerate shapes: nothing to allocate or launch (a zero-sized grid is
    // an invalid launch configuration).
    if (m1_rows <= 0 || m1_columns <= 0 || m2_columns <= 0) {
        return vector<float>();
    }

    // Do the arithmetic in size_t so rows * columns cannot overflow int.
    const size_t m1Count = static_cast<size_t>(m1_rows) * m1_columns;
    const size_t m2Count = static_cast<size_t>(m1_rows) * m2_columns;
    const size_t outCount = m1Count;

    // Host staging copies, zero-padded up to the element count the device
    // buffers were sized for. This way a vector that is shorter or longer than
    // the declared shape can neither overflow a buffer nor upload
    // uninitialized memory.
    vector<float> h_m1(m1);
    if (h_m1.size() < m1Count) {
        h_m1.resize(m1Count, 0.0f);
    }
    vector<float> h_m2(m2);
    if (h_m2.size() < m2Count) {
        h_m2.resize(m2Count, 0.0f);
    }

    const size_t m1Bytes = h_m1.size() * sizeof(float);
    const size_t m2Bytes = h_m2.size() * sizeof(float);
    const size_t outBytes = outCount * sizeof(float);

    //declare device variables
    float* d_m1 = NULL;
    float* d_m2 = NULL;
    float* d_p = NULL;

    ddotCheck(cudaMalloc((void**)&d_m1, m1Bytes), "d_m1");
    ddotCheck(cudaMalloc((void**)&d_m2, m2Bytes), "d_m2");
    ddotCheck(cudaMalloc((void**)&d_p, outBytes), "d_p");

    ddotCheck(cudaMemcpy(d_m1, &h_m1[0], m1Bytes, cudaMemcpyHostToDevice), "Memcpy d_m1");
    ddotCheck(cudaMemcpy(d_m2, &h_m2[0], m2Bytes, cudaMemcpyHostToDevice), "Memcpy d_m2");

    // Zero the output so any element the kernel does not write reads back as
    // 0 instead of uninitialized device memory.
    ddotCheck(cudaMemset(d_p, 0, outBytes), "Memset d_p");

    //set blocks and call kernel
    const int width = m1_rows;
    const int height = m1_columns;
    dim3 dBlock(32, 32);
    dim3 dGrid((width + dBlock.x - 1) / dBlock.x, (height + dBlock.y - 1) / dBlock.y);

    kdot<<<dGrid, dBlock>>>(d_m1, d_m2, d_p, m1_rows, m1_columns, m2_columns);

    // A kernel launch returns no status; launch errors must be fetched
    // explicitly. Errors raised while the kernel runs surface at the blocking
    // cudaMemcpy below.
    ddotCheck(cudaGetLastError(), "kdot function call");

    //copy device matrix straight into the result vector
    vector<float> product(outCount);
    ddotCheck(cudaMemcpy(&product[0], d_p, outBytes, cudaMemcpyDeviceToHost), "cudaMemcpy from d_p to h_p");

    //free device memory
    cudaError_t Error = cudaFree(d_m1);
    cudaCheck(Error);
    Error = cudaFree(d_m2);
    cudaCheck(Error);
    Error = cudaFree(d_p);
    cudaCheck(Error);

    // No cudaDeviceReset() here: it destroys the whole device context on
    // every call, invalidating any other device allocations in the process
    // and forcing the context to be rebuilt on the next call. Reset once at
    // program exit instead if it is needed.

    return product;
}

=== Assignment 3 ===

Spdjurovic

113

edits

CDOT Wiki β

Changes

A-Team

CDOT Wiki ^β