Source Code: ms3 neural net (CUDA C++)
#include <iostream>
#include <vector>
#include <algorithm>
#include <iterator>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <random>
#include <cassert>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
void print(const vector<float>& m, int n_rows, int n_columns) {
    /* Prints the input vector to cout as an n_rows x n_columns matrix.
       Inputs:
           m: vector holding a matrix of size n_rows x n_columns
           n_rows: int, number of rows in the matrix m
           n_columns: int, number of columns in the matrix m */
    for (int i = 0; i != n_rows; ++i) {
        for (int j = 0; j != n_columns; ++j) {
            cout << m[i * n_columns + j] << " ";
        }
        cout << '\n';
    }
    cout << endl;
}

vector<float> operator/(const vector<float>& m2, const float m1) {
    /* Returns the elementwise quotient of a vector and a float.
       Inputs:
           m2: vector
           m1: float divisor
       Output: vector m2 / m1, every element of m2 divided by m1 */
    const unsigned long VECTOR_SIZE = m2.size();
    vector<float> quotient(VECTOR_SIZE);
    for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
        quotient[i] = m2[i] / m1;
    }
    return quotient;
}

vector<float> operator-(const vector<float>& m1, const vector<float>& m2) {
    /* Returns the elementwise difference between two vectors.
       Inputs:
           m1: vector
           m2: vector
       Output: vector m1 - m2 */
    const unsigned long VECTOR_SIZE = m1.size();
    vector<float> difference(VECTOR_SIZE);
    for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
        difference[i] = m1[i] - m2[i];
    }
    return difference;
}

vector<float> operator-(const vector<float>& m1, const float* m2) {
    /* Returns the elementwise difference between a vector and a raw array.
       Inputs:
           m1: vector
           m2: pointer to an array of at least m1.size() floats
       Output: vector m1 - m2 */
    const unsigned long VECTOR_SIZE = m1.size();
    vector<float> difference(VECTOR_SIZE);
    for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
        difference[i] = m1[i] - m2[i];
    }
    return difference;
}

vector<float> operator*(const float m1, const vector<float>& m2) {
    /* Returns the product of a float and a vector (elementwise scaling).
       Inputs:
           m1: float
           m2: vector
       Output: vector m1 * m2, every element of m2 scaled by m1 */
    const unsigned long VECTOR_SIZE = m2.size();
    vector<float> product(VECTOR_SIZE);
    for (unsigned i = 0; i != VECTOR_SIZE; ++i) {
        product[i] = m1 * m2[i];
    }
    return product;
}
int argmax(const vector<float>& m) {
    // Index of the largest element of m.
    return distance(m.begin(), max_element(m.begin(), m.end()));
}
static vector<float> random_vector(const int size) {
    // Returns a vector of uniform random floats in [0, 0.05) for weight initialization.
    static mt19937 generator(random_device{}());
    uniform_real_distribution<float> distribution(0.0f, 0.05f);
    vector<float> data(size);
    generate(data.begin(), data.end(), [&]() { return distribution(generator); });
    return data;
}
vector<float> softmax(const vector<float>& z, const int dim) {
    /* Row-wise softmax: treats z as a matrix whose rows have dim elements and
       normalizes each row. Each row is shifted by its maximum before
       exponentiation, which keeps the arithmetic from overflowing. */
    const int zsize = static_cast<int>(z.size());
    vector<float> out;
    for (int i = 0; i != zsize; i += dim) {
        vector<float> foo;
        for (int j = 0; j != dim; ++j) {
            foo.push_back(z[i + j]);
        }
        float max_foo = *max_element(foo.begin(), foo.end());
        for (int j = 0; j != dim; ++j) {
            foo[j] = exp(foo[j] - max_foo);
        }
        float sum_of_elems = 0.0f;
        for (int j = 0; j != dim; ++j) {
            sum_of_elems += foo[j];
        }
        for (int j = 0; j != dim; ++j) {
            out.push_back(foo[j] / sum_of_elems);
        }
    }
    return out;
}
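// Worked example (illustrative values, not from the original source):
// softmax({1, 2, 3}, 3) shifts the row by its maximum 3, giving
// exp(-2), exp(-1), exp(0) = {0.135, 0.368, 1.0}; dividing by their sum
// 1.503 yields approximately {0.090, 0.245, 0.665}.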
__device__ float* ktranspose(float* m, const int C, const int R) {
    /* Returns the transpose of the input matrix.
       Inputs:
           m: pointer to a C x R matrix (C rows, R columns) in row-major order
           C: int, number of rows in the input matrix
           R: int, number of columns in the input matrix
       Output: pointer to a newly allocated R x C transpose of m.
       Note: the buffer comes from the device heap via new and is never freed,
       so each call leaks unless the caller delete[]s the result. */
    float* mT = new float[C * R];
    for (unsigned n = 0; n != C * R; n++) {
        unsigned i = n / C;
        unsigned j = n % C;
        mT[n] = m[R * j + i];
    }
    return mT;
}
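// Example (illustrative): for the 2 x 3 row-major matrix m = {1,2,3, 4,5,6},
// ktranspose(m, 2, 3) returns the 3 x 2 matrix {1,4, 2,5, 3,6}.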
vector<float> dot(const vector<float>& m1, const vector<float>& m2, const int m1_rows, const int m1_columns, const int m2_columns) {
    /* Returns the matrix product m1 x m2.
       Inputs:
           m1: vector, left matrix of size m1_rows x m1_columns
           m2: vector, right matrix of size m1_columns x m2_columns (the number
               of rows in the right matrix must equal the number of columns in
               the left one)
           m1_rows: int, number of rows in the left matrix m1
           m1_columns: int, number of columns in the left matrix m1
           m2_columns: int, number of columns in the right matrix m2
       Output: vector, the m1_rows x m2_columns product m1 * m2 */
    vector<float> output(m1_rows * m2_columns);
    for (int row = 0; row != m1_rows; ++row) {
        for (int col = 0; col != m2_columns; ++col) {
            output[row * m2_columns + col] = 0.f;
            for (int k = 0; k != m1_columns; ++k) {
                output[row * m2_columns + col] += m1[row * m1_columns + k] * m2[k * m2_columns + col];
            }
        }
    }
    return output;
}
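// Usage example (illustrative values): a 2x3 by 3x2 product,
//   dot({1,2,3, 4,5,6}, {7,8, 9,10, 11,12}, 2, 3, 2) == {58,64, 139,154},
// where e.g. the (0,0) entry is 1*7 + 2*9 + 3*11 = 58.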
// Version 1 dot-product kernel: computes d_p = d_a x d_b, where d_a is
// ni x nj, d_b is nj x nk, and d_p is ni x nk -- the convention used by
// every call site below. Each thread computes one output element.
__global__ void kdot(const float* d_a, const float* d_b, int ni, int nj, int nk, float* d_p) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // output row
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // output column
    if (i < ni && j < nk) {
        float sum = 0.0f;
        for (int k = 0; k < nj; k++)
            sum += d_a[i * nj + k] * d_b[k * nk + j];
        d_p[i * nk + j] = sum;
    }
}

void cudaCheck(cudaError_t Error) {
    if (Error != cudaSuccess) {
        cerr << cudaGetErrorName(Error) << "!";
        exit(EXIT_FAILURE);
    }
}
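// A minimal host-side launch sketch (an illustration, not part of the original
// program): kdot uses 2D thread indexing, so a launch must supply dim3
// geometry covering all ni x nk output elements. The 16x16 tile size is an
// arbitrary assumption.
static void launch_kdot(const float* d_a, const float* d_b, int ni, int nj, int nk, float* d_p) {
    dim3 block(16, 16);
    dim3 grid((ni + block.x - 1) / block.x,   // enough blocks for every row
              (nk + block.y - 1) / block.y);  // enough blocks for every column
    kdot<<<grid, block>>>(d_a, d_b, ni, nj, nk, d_p);
    cudaCheck(cudaGetLastError());  // surface any launch-configuration error
}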
// Splits a string on the given delimiter and returns the tokens.
vector<string> split(const string &s, char delim) {
stringstream ss(s);
string item;
vector<string> tokens;
while (getline(ss, item, delim)) {
tokens.push_back(item);
}
return tokens;
}
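// Example: split("5\t0\t128", '\t') yields {"5", "0", "128"} -- a label
// followed by pixel values, the tab-separated layout train.txt uses below.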
__device__ void krelu(float* a, float* b, int n) {
    // Elementwise ReLU: b[i] = max(a[i], 0).
    for (int i = 0; i < n; ++i) {
        if (a[i] < 0) {
            b[i] = 0.0f;
        }
        else b[i] = a[i];
    }
}

__device__ float* kreluPrime(float* a, int n) {
    // Elementwise ReLU derivative, computed in place. The inputs here are ReLU
    // outputs, so clipped units appear as exact zeros; <= keeps their
    // derivative at 0 rather than 1.
    for (int i = 0; i < n; ++i) {
        if (a[i] <= 0) {
            a[i] = 0.0f;
        }
        else a[i] = 1.0f;
    }
    return a;
}

__device__ float* ksoftmax(float* input, size_t input_len) {
    // Numerically stable in-place softmax using the log-sum-exp trick.
    assert(input);
    float m = -INFINITY;
    for (size_t i = 0; i < input_len; i++) {
        if (input[i] > m) {
            m = input[i];
        }
    }
    float sum = 0.0f;
    for (size_t i = 0; i < input_len; i++) {
        sum += expf(input[i] - m);
    }
    float offset = m + logf(sum);
    for (size_t i = 0; i < input_len; i++) {
        input[i] = expf(input[i] - offset);
    }
    return input;
}

void get_random_fill(float* a, int size) {
    // Fills a host array with uniform random floats in [0, 0.5).
    for (int i = 0; i < size; i++) {
        static std::default_random_engine e;
        static std::uniform_real_distribution<> dis(0, 0.5);  // range 0 - 0.5
        a[i] = dis(e);
    }
}
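// Note on ksoftmax above (worked equation): with m = max(x) and
// S = sum_i expf(x_i - m), the offset m + logf(S) satisfies
//   expf(x_i - (m + logf(S))) = expf(x_i - m) / S,
// so the normalization is folded into the exponent and no intermediate value
// exceeds expf(0) = 1, which avoids overflow for large logits.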
__global__ void train(float* d_W1, float* d_W2, float* d_W3, float* d_b_X, float* d_b_Y,
                      float* d_a2, float* d_a1, float* d_dyhat,
                      float* d_dW3, float* d_dW2, float* d_dW1,
                      float* d_dz2, float* d_dz1) {
    const int BATCH_SIZE = 256;
    float lr = .01f / BATCH_SIZE;
    // The child kdot grids below use 2D indexing, so each launch supplies a
    // dim3 geometry covering its output matrix. Device-side launches rely on
    // CUDA dynamic parallelism; the device-side cudaDeviceSynchronize used to
    // wait on child grids is the legacy mechanism (deprecated in recent CUDA).
    dim3 block(16, 16);

    // dW3 = a2^T (64 x BATCH) x dyhat (BATCH x 10) -> 64 x 10
    kdot<<<dim3(4, 1), block>>>(ktranspose(d_a2, BATCH_SIZE, 64), d_dyhat, 64, BATCH_SIZE, 10, d_dW3);
    // dz2 = dyhat (BATCH x 10) x W3^T (10 x 64) -> BATCH x 64
    kdot<<<dim3(16, 4), block>>>(d_dyhat, ktranspose(d_W3, 64, 10), BATCH_SIZE, 10, 64, d_dz2);
    cudaDeviceSynchronize();  // child grids must finish before d_dz2 is read
    // dz2 *= relu'(a2), elementwise
    kreluPrime(d_a2, BATCH_SIZE * 64);
    for (int i = 0; i < BATCH_SIZE * 64; i++) {
        d_dz2[i] = d_dz2[i] * d_a2[i];
    }
    // dW2 = a1^T (128 x BATCH) x dz2 (BATCH x 64) -> 128 x 64
    kdot<<<dim3(8, 4), block>>>(ktranspose(d_a1, BATCH_SIZE, 128), d_dz2, 128, BATCH_SIZE, 64, d_dW2);
    // dz1 = dz2 (BATCH x 64) x W2^T (64 x 128) -> BATCH x 128
    kdot<<<dim3(16, 8), block>>>(d_dz2, ktranspose(d_W2, 128, 64), BATCH_SIZE, 64, 128, d_dz1);
    cudaDeviceSynchronize();
    // dz1 *= relu'(a1), elementwise
    kreluPrime(d_a1, BATCH_SIZE * 128);
    for (int i = 0; i < BATCH_SIZE * 128; i++) {
        d_dz1[i] = d_dz1[i] * d_a1[i];
    }
    // dW1 = X^T (784 x BATCH) x dz1 (BATCH x 128) -> 784 x 128
    kdot<<<dim3(49, 8), block>>>(ktranspose(d_b_X, BATCH_SIZE, 784), d_dz1, 784, BATCH_SIZE, 128, d_dW1);
    cudaDeviceSynchronize();
    // Updating the parameters: W = W - lr * dW
    for (int i = 0; i < (64 * 10); i++)   { d_W3[i] = d_W3[i] - lr * d_dW3[i]; }
    for (int i = 0; i < (128 * 64); i++)  { d_W2[i] = d_W2[i] - lr * d_dW2[i]; }
    for (int i = 0; i < (784 * 128); i++) { d_W1[i] = d_W1[i] - lr * d_dW1[i]; }
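    // For reference, the gradients implemented above (standard backprop for a
    // softmax + cross-entropy output; the notation is added here, not the
    // original author's):
    //   dyhat = yhat - y
    //   dW3 = a2^T . dyhat        dz2 = (dyhat . W3^T) * relu'(a2)
    //   dW2 = a1^T . dz2          dz1 = (dz2 . W2^T) * relu'(a1)
    //   dW1 = X^T . dz1
    // with "." a matrix product and "*" an elementwise product.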
}

vector<float> relu(const vector<float>& z) {
    // Elementwise ReLU on the host: max(z[i], 0).
    int size = static_cast<int>(z.size());
    vector<float> output;
    for (int i = 0; i < size; ++i) {
        if (z[i] < 0) {
            output.push_back(0.0f);
        }
        else output.push_back(z[i]);
    }
    return output;
}

int main(int argc, const char* argv[]) {
    string line;
    vector<string> line_v;

    cout << "Loading data ...\n";
    vector<float> X_train;
    vector<float> y_train;
    ifstream myfile("train.txt");
    if (myfile.is_open()) {
        while (getline(myfile, line)) {
            line_v = split(line, '\t');
            // The first field is the label; expand it into a one-hot vector.
            int digit = static_cast<int>(strtof(line_v[0].c_str(), 0));
            for (int i = 0; i < 10; ++i) {
                if (i == digit) {
                    y_train.push_back(1.f);
                }
                else y_train.push_back(0.f);
            }
            // The remaining fields are the pixel values.
            int size = static_cast<int>(line_v.size());
            for (int i = 1; i < size; ++i) {
                X_train.push_back(strtof(line_v[i].c_str(), 0));
            }
        }
        X_train = X_train / 255.0;  // scale pixel values to [0, 1]
        myfile.close();
    }
    else cout << "Unable to open file" << '\n';
    int xsize = static_cast<int>(X_train.size());
    int ysize = static_cast<int>(y_train.size());
    // Some hyperparameters for the NN
    int BATCH_SIZE = 256;
    // float lr = .01 / BATCH_SIZE;  // the learning rate lives in the train kernel
    // Random initialization of the weights
    vector<float> W1 = random_vector(784 * 128);
    vector<float> W2 = random_vector(128 * 64);
    vector<float> W3 = random_vector(64 * 10);
    // Flat copies of the weights for transfer to and from the device
    float* fW1 = new float[784 * 128];
    float* fW2 = new float[128 * 64];
    float* fW3 = new float[64 * 10];
    for (int i = 0; i < (784 * 128); i++) {
        if (i < (128 * 64)) {
            if (i < (64 * 10)) {
                fW3[i] = W3[i];
            }
            fW2[i] = W2[i];
        }
        fW1[i] = W1[i];
    }
float* dW1 = new float[784 * 128];
float* dW2 = new float[128 * 64];
float* dW3 = new float[64 * 10];
//allocate device memory
float* d_W1;
float* d_W2;
float* d_W3;
float* d_b_X; float* d_b_Y;
float* d_a1; float* d_a2;
float* d_dyhat;
float* d_dW3; float* d_dW2; float* d_dW1;
float* d_dz2; float* d_dz1;
    // Device weight buffers
    cudaCheck(cudaMalloc((void**)&d_W1, sizeof(float) * 784 * 128));
    cudaCheck(cudaMalloc((void**)&d_W2, sizeof(float) * 128 * 64));
    cudaCheck(cudaMalloc((void**)&d_W3, sizeof(float) * 64 * 10));
    // Feed-forward activations: a1 is BATCH_SIZE x 128, a2 is BATCH_SIZE x 64
    cudaCheck(cudaMalloc((void**)&d_a1, sizeof(float) * BATCH_SIZE * 128));
    cudaCheck(cudaMalloc((void**)&d_a2, sizeof(float) * BATCH_SIZE * 64));
    cudaCheck(cudaMalloc((void**)&d_dyhat, sizeof(float) * BATCH_SIZE * 10));
    // Backpropagation buffers, sized to match the gradients they hold
    cudaCheck(cudaMalloc((void**)&d_dW3, sizeof(float) * 64 * 10));
    cudaCheck(cudaMalloc((void**)&d_dW2, sizeof(float) * 128 * 64));
    cudaCheck(cudaMalloc((void**)&d_dW1, sizeof(float) * 784 * 128));
    cudaCheck(cudaMalloc((void**)&d_dz2, sizeof(float) * BATCH_SIZE * 64));
    cudaCheck(cudaMalloc((void**)&d_dz1, sizeof(float) * BATCH_SIZE * 128));
    // Batch buffers, allocated once instead of on every training iteration
    cudaCheck(cudaMalloc((void**)&d_b_X, sizeof(float) * BATCH_SIZE * 784));
    cudaCheck(cudaMalloc((void**)&d_b_Y, sizeof(float) * BATCH_SIZE * 10));
    cudaMemcpy(d_W1, fW1, sizeof(float) * 784 * 128, cudaMemcpyHostToDevice);
    cudaMemcpy(d_W2, fW2, sizeof(float) * 128 * 64, cudaMemcpyHostToDevice);
    cudaMemcpy(d_W3, fW3, sizeof(float) * 64 * 10, cudaMemcpyHostToDevice);
cout << "Training the model ...\n"; for (unsigned i = 0; i < 10000; ++i) {
// Building batches of input variables (X) and labels (y) int randindx = rand() % (42000 - BATCH_SIZE); vector<float> b_X; vector<float> b_y; for (unsigned j = randindx * 784; j < (randindx + BATCH_SIZE) * 784; ++j) { b_X.push_back(X_train[j]); } for (unsigned k = randindx * 10; k < (randindx + BATCH_SIZE) * 10; ++k) { b_y.push_back(y_train[k]); } // Feed forward vector<float> a1 = relu(dot(b_X, W1, BATCH_SIZE, 784, 128)); vector<float> a2 = relu(dot(a1, W2, BATCH_SIZE, 128, 64)); cudaMemcpy(d_a1, &*a1.begin(), sizeof(float) * a1.size(), cudaMemcpyHostToDevice); cudaMemcpy(d_a2, &*a2.begin(), sizeof(float) * a2.size(), cudaMemcpyHostToDevice);
vector<float> yhat = softmax(dot(a2, W3, BATCH_SIZE, 64, 10), 10);
        // Back propagation
        vector<float> dyhat = yhat - b_y;
        cudaMemcpy(d_dyhat, dyhat.data(), sizeof(float) * dyhat.size(), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b_X, b_X.data(), sizeof(float) * b_X.size(), cudaMemcpyHostToDevice);
        cudaMemcpy(d_b_Y, b_y.data(), sizeof(float) * b_y.size(), cudaMemcpyHostToDevice);
        // Run the backward pass and the weight update in parallel on the device
        train<<<1, 1>>>(d_W1, d_W2, d_W3, d_b_X, d_b_Y, d_a2, d_a1, d_dyhat, d_dW3, d_dW2, d_dW1, d_dz2, d_dz1);
        // Copy the updated weights back so the next host-side forward pass
        // uses them; the synchronous copies also wait for the kernel to finish.
        cudaMemcpy(fW1, d_W1, sizeof(float) * 784 * 128, cudaMemcpyDeviceToHost);
        cudaMemcpy(fW2, d_W2, sizeof(float) * 128 * 64, cudaMemcpyDeviceToHost);
        cudaMemcpy(fW3, d_W3, sizeof(float) * 64 * 10, cudaMemcpyDeviceToHost);
        copy(fW1, fW1 + 784 * 128, W1.begin());
        copy(fW2, fW2 + 128 * 64, W2.begin());
        copy(fW3, fW3 + 64 * 10, W3.begin());
        if ((i + 1) % 100 == 0) {
            cout << "-----------------------------------------------Epoch " << i + 1 << "--------------------------------------------------" << "\n";
            cout << "Predictions:" << "\n";
            print(yhat, 10, 10);
            cout << "Ground truth:" << "\n";
            print(b_y, 10, 10);
            // Report the mean squared error over the batch as the loss
            vector<float> loss_m = yhat - b_y;
            float loss = 0.0f;
            for (int k = 0; k < BATCH_SIZE * 10; ++k) {
                loss += loss_m[k] * loss_m[k];
            }
            cout << " Loss " << loss / BATCH_SIZE << "\n";
            cout << "--------------------------------------------End of Epoch :(------------------------------------------------" << "\n";
        }
    }
    // Free device memory
    cudaFree(d_dW1); cudaFree(d_dW2); cudaFree(d_dW3);
    cudaFree(d_dyhat);
    cudaFree(d_dz1); cudaFree(d_dz2);
    cudaFree(d_W1); cudaFree(d_W2); cudaFree(d_W3);
    cudaFree(d_b_X); cudaFree(d_b_Y);
    cudaDeviceReset();
    // Free host memory
    delete[] fW1; delete[] fW2; delete[] fW3;
    delete[] dW1; delete[] dW2; delete[] dW3;
    return 0;
}