113
edits
Changes
A-Team
,→Back Propagation Acceleration
=== Assignment 2 ===
During assignment 2, we tried a simple kernel in the shape of a dot product. It achieved nothing special: as predicted at the end of assignment 1, repeatedly calling cudaMalloc and cudaMemcpy on every invocation had severe consequences for run time.
====Initial implementation====
// Prints "Failed @ <what> <CUDA error name>!" to cerr and terminates the
// process when a CUDA runtime call did not return cudaSuccess.
static void ddotCheck(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "Failed @ " << what << " " << cudaGetErrorName(status) << "!";
        exit(EXIT_FAILURE);
    }
}

// Allocates `count` floats on the device and fills them from `src`.
// Elements of `src` beyond `count` are ignored, and device elements beyond
// src.size() are zeroed, so neither the host vector nor the device buffer is
// ever read or written out of bounds.
// allocWhat / copyWhat are the labels used in the failure message.
static float* ddotUpload(const vector <float>& src, size_t count, const char* allocWhat, const char* copyWhat) {
    float* d_buf = NULL;
    ddotCheck(cudaMalloc((void**)&d_buf, count * sizeof(float)), allocWhat);

    const size_t available = src.size() < count ? src.size() : count;
    if (available < count) {
        // The vector is shorter than the buffer: give the tail a defined value.
        ddotCheck(cudaMemset(d_buf, 0, count * sizeof(float)), copyWhat);
    }
    if (available > 0) {
        // Copy straight from the vector's storage; no host staging buffer needed.
        ddotCheck(cudaMemcpy(d_buf, &src[0], available * sizeof(float), cudaMemcpyHostToDevice), copyWhat);
    }
    return d_buf;
}

// Runs the kdot kernel on the device for the flat matrices m1 and m2 and
// returns the result as a host vector.
//
// Parameters:
//   m1, m2      flat host matrices that are uploaded to the device
//   m1_rows, m1_columns, m2_columns
//               dimensions forwarded unchanged to kdot
// Returns:
//   m1_rows * m1_columns floats copied back from the device product buffer,
//   or an empty vector when any dimension is not positive.
// Errors:
//   any failing CUDA call prints a diagnostic to cerr and exits the process.
//
// NOTE(review): kdot is not visible from here. The device buffer sizes
// (m1_rows * m2_columns for m2, m1_rows * m1_columns for the product) and the
// launch grid (m1_rows x m1_columns) are kept exactly as they were. If kdot is
// a matrix product, m2 would need m1_columns * m2_columns elements and the
// product/grid m1_rows x m2_columns -- confirm against the kernel.
vector <float> ddot(const vector <float>& m1, const vector <float>& m2, const int m1_rows, const int m1_columns, const int m2_columns) {
    cudaError_t Error = cudaSuccess;
    vector<float> product;

    // Nothing to compute; this also avoids launching a grid with a zero dimension.
    if (m1_rows <= 0 || m1_columns <= 0 || m2_columns <= 0) {
        return product;
    }

    // Element counts are computed in size_t so large matrices cannot overflow int.
    const size_t m1Count = (size_t)m1_rows * (size_t)m1_columns;
    const size_t m2Count = (size_t)m1_rows * (size_t)m2_columns;
    const size_t productCount = (size_t)m1_rows * (size_t)m1_columns;

    // device buffers
    float* d_m1 = ddotUpload(m1, m1Count, "d_m1", "Memcpy d_m1");
    float* d_m2 = ddotUpload(m2, m2Count, "d_m2", "Memcpy d_m2");
    float* d_p = NULL;
    ddotCheck(cudaMalloc((void**)&d_p, productCount * sizeof(float)), "d_p");

    // set blocks and call kernel
    int width = m1_rows;
    int height = m1_columns;
    dim3 dBlock(32, 32);
    dim3 dGrid((width + dBlock.x - 1) / dBlock.x, (height + dBlock.y - 1) / dBlock.y);
    kdot<<<dGrid, dBlock>>>(d_m1, d_m2, d_p, m1_rows, m1_columns, m2_columns);
    // A kernel launch returns no status itself; launch failures have to be
    // fetched explicitly with cudaGetLastError().
    ddotCheck(cudaGetLastError(), "kdot function call");

    // Copy the device result directly into the returned vector. The blocking
    // cudaMemcpy also waits for the kernel to finish.
    product.resize(productCount);
    ddotCheck(cudaMemcpy(&product[0], d_p, productCount * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy from d_p to h_p");

    // free device memory
    Error = cudaFree(d_m1);
    cudaCheck(Error);
    Error = cudaFree(d_m2);
    cudaCheck(Error);
    Error = cudaFree(d_p);
    cudaCheck(Error);

    // cudaDeviceReset() is intentionally not called here: it destroys every
    // allocation the process holds on the device and forces the context to be
    // re-created on the next call, which is both unsafe for callers and slow
    // when ddot is invoked repeatedly.
    return product;
}
=== Assignment 3 ===