GPU621/Threadless Horsemen
Contents
Comparing Multi-threading in Julia vs OpenMP
Group Members
Introduction: The Julia Programming Language
- used for scientific computing
- alternative to matlab
- faster than python, not as fast c, has a repl for quick edit-run debug cycles
Julia's Forms of Parallelism
- multi-threading (our focus)
- multi-core / distributed processing (like mpi?)
- coroutines / green threads
OpenMP vs Julia Code
- Julia code is more concise than C/C++ (also simpler to initialize arrays, don't need to manage memory)
- Note that OpenMp code uses loop interchange (ikj) and Julia doesn't (ijk), which will be explained in the next section
OpenMP | Julia Multi-threading |
---|---|
#include <omp.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#pragma once
#define N 512
typedef double array[N];
// matmul returns the product c = a * b
//
void matmul(const double a[][N], const double b[][N], double c[][N], int n) {
#pragma omp parallel for
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
for (int j = 0; j < n; j++) {
c[i][j] += a[i][k] * b[k][j];
}
}
}
}
// checksum returns the sum of the coefficients in matrix x[N][N]
//
double checksum(const double x[][N]) {
double sum = 0.0;
#pragma omp parallel for
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
sum += x[i][j];
}
}
return sum;
}
// initialize initializes matrix a[N][N]
//
void initialize(double a[][N]) {
#pragma omp parallel for
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
a[i][j] = static_cast<double>(i * j) / (N * N);
}
}
}
void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(span);
std::cout << msg << ": " <<
ms.count() << " milliseconds" << std::endl;
}
int main(int argc, char** argv) {
if (argc != 1) {
std::cerr << "*** Incorrect number of arguments ***\n";
std::cerr << "Usage: " << argv[0] << "\n";
return 1;
}
std::cout << std::fixed << std::setprecision(2);
// allocate memory for matrices
double* a = new double[N*N];
double* b = new double[N*N];
double* c = new double[N*N];
array* aa = (array*)a;
array* bb = (array*)b;
array* cc = (array*)c;
std::cout << "Matrix Multiplication " << N << " by " << N << std::endl;
// initialize a and b
std::chrono::steady_clock::time_point ts, te;
ts = std::chrono::steady_clock::now();
initialize(aa);
initialize(bb);
double* t = c;
for (int i = 0; i < N * N; i++)
*t++ = 0.0;
te = std::chrono::steady_clock::now();
reportTime("initialization", te - ts);
// multiply matrices a and b
ts = std::chrono::steady_clock::now();
matmul(aa, bb, cc, N);
te = std::chrono::steady_clock::now();
reportTime("execution", te - ts);
std::cout << "checksum = " << checksum(cc) << std::endl;
// deallocate memory
delete[] a;
delete[] b;
delete[] c;
std::cout << "max threads: " << omp_get_max_threads() << std::endl;
} |
N = 512
function matmul(a, b, c, n)
Threads.@threads for i = 1:n
for j = 1:n
for k = 1:n
c[i,j] += a[i,k] * b[k,j]
end
end
end
return c
end
function checksum(x)
sum = 0.0
Threads.@threads for i = 1:N
for j = 1:N
sum += x[i,j]
end
end
return sum
end
# creates and returns 2d array of zeroes
function initialize()
a = zeros(Float64, N, N)
Threads.@threads for i = 1:N
for j = 1:N
a[i,j] = (convert(Float64,((i-1) * (j-1))) / convert(Float64, (N * N)))
end
end
return a
end
a = initialize();
b = initialize();
c = initialize();
start_t = time()
d = matmul(a, b, c, N)
end_t = time()
run_time = end_t - start_t
sum = checksum(d)
println("checksum = $sum")
println("matmul func time (s):")
println(run_time)
println("num threads: ")
nt = Threads.nthreads()
println(nt) |
More code here: https://github.com/tsarkarsc/parallel_prog
OpenMP vs Julia Results
- add graphs
- recap loop interchange benefits for openmp (locality of reference)
- discuss julia storing arrays as column major, loop interchange was worse for julia
- discuss different levels of optimization
Conclusion
- summary of everything