== Comparing Multi-threading in Julia vs OpenMP ==

== Group Members ==

== Introduction: The Julia Programming Language ==
* used for scientific computing
* an alternative to MATLAB
* faster than Python, though not as fast as C; has a REPL for quick edit-run-debug cycles

== Julia's Forms of Parallelism ==
* multi-threading (our focus)
* multi-core / distributed processing (similar in spirit to MPI)
* coroutines / green threads (see the sketch below)
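
To make these three forms concrete, here is a minimal sketch with toy workloads of our own (not from the course material); it assumes Julia was started with more than one thread and that spawning worker processes is allowed:

<source lang="julia">
# A minimal sketch of Julia's three forms of parallelism (toy workloads)

# 1. Multi-threading: shared memory; start Julia with JULIA_NUM_THREADS > 1
Threads.@threads for i = 1:8
    println("iteration $i on thread $(Threads.threadid())")
end

# 2. Multi-core / distributed: separate worker processes, message passing
#    (conceptually similar to MPI)
using Distributed
addprocs(2)                              # launch two worker processes
squares = @distributed (+) for i = 1:100
    i * i                                # partial results reduced with (+)
end
println("sum of squares: $squares")

# 3. Coroutines / green threads: cooperative tasks on the same thread
task = @async begin
    sleep(0.5)                           # yields to other tasks while waiting
    "done"
end
println(fetch(task))                     # wait for and print the task's result
</source>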
== OpenMP vs Julia Code ==
* Julia code is more concise than C/C++: arrays are simpler to initialize, and there is no manual memory management
* Note that the OpenMP code uses loop interchange (ikj order) while the Julia code keeps the natural ijk order, which is explained in the next section

{| class="wikitable"
|-
! OpenMP
! Julia Multi-threading
|-
|
<source lang="cpp">
#include <omp.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>

#define N 512
typedef double array[N];

// matmul computes the product c = a * b
//
void matmul(const double a[][N], const double b[][N], double c[][N], int n) {

    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int k = 0; k < n; k++) {
            for (int j = 0; j < n; j++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

// checksum returns the sum of the coefficients in matrix x[N][N];
// reduction(+:sum) gives each thread a private accumulator and combines
// them at the end, avoiding a data race on sum
//
double checksum(const double x[][N]) {
    double sum = 0.0;

    #pragma omp parallel for reduction(+:sum)
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            sum += x[i][j];
        }
    }
    return sum;
}

// initialize initializes matrix a[N][N]
//
void initialize(double a[][N]) {

    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = static_cast<double>(i * j) / (N * N);
        }
    }
}

void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(span);
    std::cout << msg << ": " <<
        ms.count() << " milliseconds" << std::endl;
}

int main(int argc, char** argv) {
    if (argc != 1) {
        std::cerr << "*** Incorrect number of arguments ***\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        return 1;
    }
    std::cout << std::fixed << std::setprecision(2);

    // allocate memory for matrices
    double* a = new double[N*N];
    double* b = new double[N*N];
    double* c = new double[N*N];
    array* aa = (array*)a;
    array* bb = (array*)b;
    array* cc = (array*)c;
    std::cout << "Matrix Multiplication " << N << " by " << N << std::endl;

    // initialize a and b; zero out c
    std::chrono::steady_clock::time_point ts, te;
    ts = std::chrono::steady_clock::now();
    initialize(aa);
    initialize(bb);
    double* t = c;
    for (int i = 0; i < N * N; i++)
        *t++ = 0.0;
    te = std::chrono::steady_clock::now();
    reportTime("initialization", te - ts);

    // multiply matrices a and b
    ts = std::chrono::steady_clock::now();
    matmul(aa, bb, cc, N);
    te = std::chrono::steady_clock::now();
    reportTime("execution", te - ts);
    std::cout << "checksum = " << checksum(cc) << std::endl;

    // deallocate memory
    delete[] a;
    delete[] b;
    delete[] c;

    std::cout << "max threads: " << omp_get_max_threads() << std::endl;
}
</source>
|
<source lang="julia">
N = 512

function matmul(a, b, c, n)
    Threads.@threads for i = 1:n
        for j = 1:n
            for k = 1:n
                c[i,j] += a[i,k] * b[k,j]
            end
        end
    end

    return c
end

# each thread accumulates its rows locally, then adds its local total to a
# thread-safe atomic; a plain shared accumulator here would be a data race
function checksum(x)
    total = Threads.Atomic{Float64}(0.0)

    Threads.@threads for i = 1:N
        local_sum = 0.0
        for j = 1:N
            local_sum += x[i,j]
        end
        Threads.atomic_add!(total, local_sum)
    end

    return total[]
end

# creates and returns a 2d array initialized the same way as the C++ version
function initialize()
    a = zeros(Float64, N, N)

    Threads.@threads for i = 1:N
        for j = 1:N
            # i-1 and j-1 mirror the 0-based indices of the C++ code
            a[i,j] = convert(Float64, (i-1) * (j-1)) / convert(Float64, N * N)
        end
    end

    return a
end

a = initialize();
b = initialize();
c = zeros(Float64, N, N);   # c must start at zero, as in the C++ version

start_t = time()

d = matmul(a, b, c, N)

end_t = time()
run_time = end_t - start_t

total = checksum(d)    # named total rather than sum to avoid shadowing Base.sum

println("checksum = $total")

println("matmul func time (s):")
println(run_time)

println("num threads: ")
nt = Threads.nthreads()
println(nt)
</source>
|}
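
As a sanity check on both implementations (this derivation is ours, not part of the original write-up): both programs fill a and b with a[i][j] = b[i][j] = ij/N² for 0-based indices, so the checksum of c = a·b has the closed form (Σi)·(Σj)·(Σk k²)/N⁴ over i, j, k = 0..N−1. A few lines of Julia evaluate it:

<source lang="julia">
# expected checksum for c = a*b with a[i][j] = b[i][j] = i*j/N^2 (0-based):
# sum_ij c[i][j] = (sum_i i) * (sum_j j) * (sum_k k^2) / N^4
N = 512
si  = sum(0:N-1)                  # Σ i  for i = 0..N-1
sk2 = sum(k^2 for k = 0:N-1)      # Σ k² for k = 0..N-1
expected = si * si * sk2 / N^4
println("expected checksum = $expected")
</source>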

More code here: https://github.com/tsarkarsc/parallel_prog
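
One practical note for reproducing the Julia timings (the environment variable is standard Julia, not something from the original write-up): Julia fixes its thread count at startup, so Threads.@threads runs serially unless JULIA_NUM_THREADS was set before launching. A quick guard:

<source lang="julia">
# launch with e.g.:  JULIA_NUM_THREADS=8 julia matmul.jl
if Threads.nthreads() == 1
    @warn "Julia is running single-threaded; set JULIA_NUM_THREADS before launching"
end
println("running with $(Threads.nthreads()) threads")
</source>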
== OpenMP vs Julia Results ==
* add graphs
* recap the benefits of loop interchange for OpenMP (locality of reference)
* discuss how Julia stores arrays in column-major order, which made loop interchange worse for Julia (see the sketch below)
* discuss different levels of optimization
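
To preview the locality argument, here is a minimal sketch (our own toy example, not from the original write-up) that times two traversal orders over the same matrix; because Julia arrays are column-major, the version whose inner loop walks down a column touches consecutive memory:

<source lang="julia">
# times two traversal orders over the same n×n matrix to show the
# effect of Julia's column-major array layout
n = 4096
x = rand(n, n)

# column-friendly: inner loop walks down a column (contiguous memory)
function sum_col_major(x)
    s = 0.0
    for j = 1:size(x, 2)
        for i = 1:size(x, 1)
            s += x[i, j]
        end
    end
    return s
end

# row-first: inner loop strides across columns (jumps n elements each step)
function sum_row_major(x)
    s = 0.0
    for i = 1:size(x, 1)
        for j = 1:size(x, 2)
            s += x[i, j]
        end
    end
    return s
end

sum_col_major(x); sum_row_major(x)   # warm up so timings exclude compilation
@time sum_col_major(x)               # expected to be noticeably faster
@time sum_row_major(x)
</source>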
== Conclusion ==
* summary of everything