|
<syntaxhighlight lang="cpp">
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <cuda_runtime.h>

__device__ float* k_difference(const float* m1, const float* m2, const int size) {
    /* Returns the element-wise difference of the two vectors. */
    float* difference = new float[size];
    for (int i = 0; i < size; i++) {
        difference[i] = m1[i] - m2[i];
    }
    return difference;
}

__device__ float* k_MFV(const float f, const float* m, const int size) {
    /* Multiplies every element of vector m by the scalar f. */
    float* mult = new float[size];
    for (int i = 0; i < size; i++) {
        mult[i] = f * m[i];
    }
    return mult;
}

__device__ float* k_MM(float* m1, float* m2, const int m2_size) {
    /* Element-wise (Hadamard) product of two vectors. */
    float* product = new float[m2_size];
    for (int i = 0; i != m2_size; ++i) {
        product[i] = m1[i] * m2[i];
    }
    return product;
}

__device__ float* k_transpose(float* m, const int C, const int R) {
    /* Returns the transpose of the input matrix.
       Inputs:
           m: input matrix, stored row-major
           C: number of columns in the input matrix
           R: number of rows in the input matrix
       Output: transpose mT of the input matrix m */
    float* mT = new float[C * R];
    for (unsigned n = 0; n != C * R; n++) {
        unsigned i = n / C;
        unsigned j = n % C;
        mT[n] = m[R * j + i];
    }
    return mT;
}

/* Single-threaded device matrix product: d_a is ni x nj, d_b is nj x nk, d_p is ni x nk. */
__device__ void dkernel_dot(float* d_a, float* d_b, int ni, int nj, int nk, float* d_p) {
    for (int row = 0; row != ni; ++row) {
        for (int col = 0; col != nk; ++col) {
            d_p[row * nk + col] = 0.f;
            for (int k = 0; k != nj; ++k) {
                d_p[row * nk + col] += d_a[row * nj + k] * d_b[k * nk + col];
            }
        }
    }
}

// version 1 dot product: one thread per output element, using the same
// (ni x nj) * (nj x nk) = (ni x nk) convention as dkernel_dot
__global__ void kernel_dot(float* d_a, float* d_b, int ni, int nj, int nk, float* d_p) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // output row
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // output column
    // matrix multiplication
    if (i < ni && j < nk) {
        float sum = 0.0f;
        for (int k = 0; k < nj; k++)
            sum += d_a[i * nj + k] * d_b[k * nk + j];
        d_p[i * nk + j] = sum;
    }
}

void cudaCheck(cudaError_t Error) {
    if (Error != cudaSuccess) {
        std::cerr << cudaGetErrorName(Error) << "!";
        exit(EXIT_FAILURE);
    }
}

// activation functions
__device__ float* k_relu(float* a, int n) {
    /* In-place ReLU-style activation; negative entries are clamped to a small constant. */
    for (int i = 0; i < n; ++i) {
        if (a[i] < 0) {
            a[i] = 0.01f;
        }
    }
    return a;
}

__device__ float* k_reluPrime(float* a, int n) {
    /* In-place ReLU derivative: 1 for positive entries, 0 otherwise. */
    for (int i = 0; i < n; ++i) {
        a[i] = (a[i] > 0) ? 1.0f : 0.0f;
    }
    return a;
}

__global__ void kernel_relu(float* a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && a[i] < 0) {
        a[i] = 0.01f;
    }
}

__global__ void kernel_reluPrime(float* a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        a[i] = (a[i] > 0) ? 1.0f : 0.0f;
    }
}

__device__ void ksoftmax(float* input, int input_len) {
    /* Numerically stable in-place softmax: subtract the maximum before
       exponentiating, and normalize via the log-sum-exp trick. */
    // Find the maximum value of the input array.
    float m = input[0];
    for (int i = 1; i < input_len; i++) {
        if (input[i] > m) {
            m = input[i];
        }
    }
    float sum = 0;
    for (int i = 0; i < input_len; i++) {
        sum += expf(input[i] - m);
    }
    for (int i = 0; i < input_len; i++) {
        input[i] = expf(input[i] - m - logf(sum));
    }
}

__device__ void k_sigmoid(float* m1, int size) {
    /* Applies the sigmoid function f(x) = 1/(1 + e^-x)
       to every element of the input vector m1, in place. */
    for (int i = 0; i != size; ++i) {
        m1[i] = 1.0f / (1.0f + expf(-m1[i]));
    }
}

// Forward pass: X(256x784) -> a1(256x128) -> a2(256x64) -> yhat(256x10).
// The child kernel launches require compiling with relocatable device code (-rdc=true).
__global__ void feed_forward(float* d_b_X, float* d_W1, float* d_W2, float* d_W3, float* d_b_Y,
                             float* d_a1, float* d_a2, float* d_yhat, float* d_dyhat) {
    int BATCH_SIZE = 256;
    float lr = 0.01f / BATCH_SIZE;
    float* tempY = new float[256 * 64];

    // feed forward
    kernel_dot<<<256, 256>>>(d_b_X, d_W1, BATCH_SIZE, 784, 128, d_a1);
    cudaDeviceSynchronize();
    k_relu(d_a1, BATCH_SIZE * 128);   // a1 holds BATCH_SIZE x 128 values

    kernel_dot<<<256, 128>>>(d_a1, d_W2, BATCH_SIZE, 128, 64, d_a2);
    cudaDeviceSynchronize();
    k_relu(d_a2, BATCH_SIZE * 64);    // a2 holds BATCH_SIZE x 64 values

    kernel_dot<<<256, 64>>>(d_a2, d_W3, BATCH_SIZE, 64, 10, d_yhat);
    cudaDeviceSynchronize();

    // Softmax over the first 100 outputs, using tempY as scratch space.
    for (int i = 0; i < 100; i++) {
        tempY[i] = d_yhat[i];
    }
    ksoftmax(tempY, 10 * 10);
    for (int i = 0; i < 100; i++) {
        d_yhat[i] = tempY[i];
    }
    delete[] tempY;
}

__global__ void train(float* d_W1, float* d_W2, float* d_W3, float* d_b_X, float* d_b_Y,
                      float* d_a2, float* d_a1, float* d_yhat, float* d_dyhat,
                      float* d_dW3, float* d_dW2, float* d_dW1,
                      float* d_dz2, float* d_dz1, float* d_t) {
    // NOTE: the backpropagation pass is not implemented here.
    cudaError_t Error = cudaSuccess;
    int BATCH_SIZE = 256;
    printf("\n train: error state %d, batch size %d \n", (int)Error, BATCH_SIZE);
}
</syntaxhighlight>
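The matrix-product kernel above is easiest to sanity-check from ordinary host code. The following is only a minimal sketch under a few assumptions: the <code>main</code> function, the 256x784 by 784x128 problem size, and the 16x16 block shape are illustrative choices rather than part of the project code, and the listing above is assumed to be in the same file (compiled with <code>nvcc -rdc=true</code> because of the child launches in <code>feed_forward</code>) so that <code>kernel_dot</code> and <code>cudaCheck</code> are in scope.

<syntaxhighlight lang="cpp">
int main() {
    const int ni = 256, nj = 784, nk = 128;   // A is ni x nj, B is nj x nk, P is ni x nk
    float *d_a, *d_b, *d_p;
    cudaCheck(cudaMalloc(&d_a, ni * nj * sizeof(float)));
    cudaCheck(cudaMalloc(&d_b, nj * nk * sizeof(float)));
    cudaCheck(cudaMalloc(&d_p, ni * nk * sizeof(float)));
    // ... fill d_a and d_b with cudaMemcpy from host buffers ...

    // One thread per output element: x covers the ni rows, y covers the nk columns.
    dim3 block(16, 16);
    dim3 grid((ni + block.x - 1) / block.x, (nk + block.y - 1) / block.y);
    kernel_dot<<<grid, block>>>(d_a, d_b, ni, nj, nk, d_p);
    cudaCheck(cudaGetLastError());
    cudaCheck(cudaDeviceSynchronize());

    // ... copy d_p back to the host, then release the buffers ...
    cudaCheck(cudaFree(d_a));
    cudaCheck(cudaFree(d_b));
    cudaCheck(cudaFree(d_p));
    return 0;
}
</syntaxhighlight>

A two-dimensional launch is used because <code>kernel_dot</code> takes its row index from the x dimension and its column index from the y dimension, so the grid has to extend in both directions to cover every element of the ni x nk output.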
|}