===Dynamic Parallelism===
Dynamic Parallelism in CUDA lets a kernel launch and synchronize nested (child) kernels directly on the device. For our use case, it also lets us keep more of the work on the device, avoiding repeated cudaMemcpy() and cudaMalloc() calls between iterations.
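As a minimal illustration (hypothetical kernel names, not part of our project code; requires compute capability 3.5+ and relocatable device code), a parent kernel can launch a child kernel and wait for it entirely on the device:

<syntaxhighlight lang="cpp">
#include <cstdio>

// child kernel: each child thread reports which parent thread launched it
__global__ void childKernel(int parentThread) {
    printf("child thread %d launched by parent thread %d\n", threadIdx.x, parentThread);
}

// parent kernel: every parent thread launches a small child grid on the device
__global__ void parentKernel() {
    childKernel<<<1, 4>>>(threadIdx.x);
    cudaDeviceSynchronize();   // device-side wait for the child grid to finish
}

int main() {
    parentKernel<<<1, 2>>>();
    cudaDeviceSynchronize();   // host waits for the parent grid and its children
    return 0;
}
</syntaxhighlight>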
===Later===
{| class="wikitable mw-collapsible mw-collapsed"
! Parent kernel calling a child kernel (main .cu)
|-
|
<syntaxhighlight lang="cpp">__global__ void train(float* d_W1, float* d_W2, float* d_W3, float* d_b_X, float* d_b_Y, float* d_a2, float* d_a1, float* d_yhat, float* d_dyhat, float* d_dW3, float* d_dW2, float* d_dW1, float* d_dz2, float* d_dz1, float* d_t) { int BATCH_SIZE = 256; float lr = 0.01 / BATCH_SIZE; //backpropagation d_dyhat = k_difference(d_yhat, d_b_Y, 10 * 10); kernel_dot <<<(2560 + 128)/64, 64>>> (d_dyhat, k_transpose(d_W3, 64, 10), BATCH_SIZE, 10, 64, d_dz2); cudaDeviceSynchronize();} __global__ void kernel_dot(float* d_a, float* d_b, int ni, int nj, int nk, float* d_p) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; //matrix multiplication if (i < ni && j < nj) { float sum = 0.0f; for (int k = 0; k < nk; k++) sum += d_a[i * nk + k] * d_b[k * nj + j]; d_p[i * nj + j] = sum; }}</syntaxhighlight>|} ===Final Iteration==={| class="wikitable mw-collapsible mw-collapsed"! GPU code|-|<syntaxhighlight lang="cpp">__device__ float* k_difference(const float* m1, const float* m2, const int size) { /* Returns the difference between the two vectors. */ float* difference = new float[size]; for (int i = 0; i < size; i++) { difference[i] = m1[i] - m2[i]; } return difference;}__device__ float* k_MFV(const float f, const float* m, const int size) { float* mult = new float[size]; for (int i = 0; i < size; i++) { mult[i] = f * m[i]; } return mult;}__device__ float* k_MM(float* m1, float* m2, const int m2_size) { float* product = new float[m2_size];  for (int i = 0; i != m2_size; ++i) { product[i] = m1[i] * m2[i]; };  return product;}__device__ float* k_transpose(float *m, const int C, const int R) {  /* Returns a transpose matrix of input matrix. Inputs: m: vector, input matrix C: int, number of columns in the input matrix R: int, number of rows in the input matrix Output: vector, transpose matrix mT of input matrix m */  float* mT = new float[C * R]; for (unsigned n = 0; n != C * R; n++) { unsigned i = n / C; unsigned j = n % C; mT[n] = m[R*j + i]; }  return mT;  //for (int i = 0; i<R; ++i) // for (int j = 0; j<C; ++j) // { // mT[j * C + i] = m[i * R + j]; // }  //return mT;}__device__ void dkernel_dot(float* d_a, float* d_b, int ni, int nj, int nk, float* d_p) { for (int row = 0; row != ni; ++row) { for (int col = 0; col != nk; ++col) { d_p[row * nk + col] = 0.f; for (int k = 0; k != nj; ++k) { d_p[row * nk + col] += d_a[row * nj + k] * d_b[k * nk + col]; } } }}//version 1 dot product__global__ void kernel_dot(float* d_a, float* d_b, int ni, int nj, int nk, float* d_p) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; //matrix multiplication if (i < ni && j < nj) { float sum = 0.0f; for (int k = 0; k < nk; k++) sum += d_a[i * nk + k] * d_b[k * nj + j]; d_p[i * nj + j] = sum; }}void cudaCheck(cudaError_t Error) { if (Error != cudaSuccess) { cerr << cudaGetErrorName(Error) << "!"; exit(EXIT_FAILURE); }}   __device__ float* k_relu(float* a, int n) { for (int i = 0; i < n; ++i) { if (a[i] < 0) { a[i] = 0.01f; } else a[i] = a[i]; } return a;}__device__ float* k_reluPrime(float* a, int n) { for (int i = 0; i < n; ++i) { if (a[i] > 0) { a[i] = 1.0f; } else a[i] = 0.0; } return a;}///activation functions __global__ __global__ void kernel_relu(float* a, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if(i < n) { if (a[i] < 0) { a[i] = 0.01f; } else a[i] = a[i]; }}__global__ void kernel_reluPrime(float* a, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { if (a[i] > 0) { a[i] = 1.0f; } 
else a[i] = 0.0; }}   __device__ void ksoftmax(float *input, int input_len) { //assert(input != NULL); //assert(input_len != 0); int i; float m; /* Find maximum value from input array */ m = input[0]; for (i = 1; i < input_len; i++) { if (input[i] > m) { m = input[i]; } }  float sum = 0; for (i = 0; i < input_len; i++) { sum += expf(input[i] - m); }  for (i = 0; i < input_len; i++) { input[i] = expf(input[i] - m - log(sum));  } } __device__ void k_sigmoid(float* m1, int size) {  /* Returns the value of the sigmoid function f(x) = 1/(1 + e^-x). Input: m1, a vector. Output: 1/(1 + e^-x) for every element of the input matrix m1. */ for (unsigned i = 0; i != size; ++i) { m1[i] = 1 / (1 + exp(-m1[i])); }}__global__ void feed_forward(float* d_b_X, float* d_W1, float* d_W2, float* d_W3, float* d_b_Y, float* d_a1, float* d_a2, float* d_yhat, float* d_dyhat) { int BATCH_SIZE = 256; float lr = 0.01 / BATCH_SIZE; float* tempY = new float[256 * 64]; //feed forward kernel_dot <<<256, 256>>> (d_b_X, d_W1, BATCH_SIZE, 784, 128, d_a1); cudaDeviceSynchronize(); k_relu(d_a1, BATCH_SIZE * 784); kernel_dot <<<256, 128>>> (d_a1, d_W2, BATCH_SIZE, 128, 64, d_a2); cudaDeviceSynchronize(); k_relu(d_a2, BATCH_SIZE * 128); kernel_dot <<<256, 64>>> (d_a2, d_W3, BATCH_SIZE, 64, 10, d_yhat); cudaDeviceSynchronize(); ksoftmax(tempY, 10 * 10); for (int i = 0; i < 100; i++) { d_yhat[i] = tempY[i]; } delete[] tempY;}  __global__ void train(float* d_W1, float* d_W2, float* d_W3, float* d_b_X, float* d_b_Y, float* d_a2, float* d_a1, float* d_yhat, float* d_dyhat, float* d_dW3, float* d_dW2, float* d_dW1, float* d_dz2, float* d_dz1, float* d_t) {
    // report the status of the most recent device-runtime call
    cudaError_t Error = cudaGetLastError();
    int BATCH_SIZE = 256;
    printf("\n %s \n", cudaGetErrorString(Error));
}
</syntaxhighlight>
|}
===Final Profile===
This final profile covers only 20 iterations, as errors occurred beyond that point, likely due to naive coding and poor coding practices.
[[File:nnfinalprofile.jpg]]
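One possible explanation (an assumption on our part, not something the profiler confirmed) is exhaustion of the device heap: several of the device-side helpers above allocate with new and the results are never freed, and the heap available to device-side new/malloc defaults to 8 MB. A minimal host-side sketch of raising that limit before any kernel launches:

<syntaxhighlight lang="cpp">
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    // enlarge the heap used by device-side new/malloc; must be set
    // before the first kernel launch
    size_t heapSize = 64 * 1024 * 1024;   // 64 MB, an arbitrary example value
    cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapSize);
    printf("cudaDeviceSetLimit: %s\n", cudaGetErrorString(err));
    return 0;
}
</syntaxhighlight>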
===Compiling===
Follow the article below to set up Visual Studio for dynamic parallelism; recommended readings:

* http://developer.download.nvidia.com/assets/cuda/files/CUDADownloads/TechBrief_Dynamic_Parallelism_in_CUDA.pdf
* http://ramblingsofagamedevstudent.blogspot.com/2014/03/set-up-visual-studio-2012-for-cuda.html
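For command-line builds outside Visual Studio, dynamic parallelism requires a compute capability 3.5+ target, relocatable device code, and linking against the device runtime library; a sketch (the source file name here is just a placeholder):

 nvcc -arch=sm_35 -rdc=true neural_net.cu -o neural_net -lcudadevrt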
=== Assignment 3 ===
====What we would do differently:====
There are many things we would change; one of the major ones would be to take on a more manageable task, one with proper documentation and reasoning behind the chosen values.