Comparing Multi-threading in Julia vs OpenMP

Group Members

Introduction: The Julia Programming Language

used for scientific computing
alternative to matlab
faster than python, not as fast c, has a repl for quick edit-run debug cycles

Julia's Forms of Parallelism

multi-threading (our focus)
multi-core / distributed processing (like mpi?)
coroutines / green threads

OpenMP vs Julia Code

Julia code is more concise than C/C++ (also simpler to initialize arrays, don't need to manage memory)
Note that OpenMp code uses loop interchange (ikj) and Julia doesn't (ijk), which will be explained in the next section

OpenMP

Julia Multi-threading

#include <omp.h>
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>

#pragma once
#define N 512
typedef double array[N];

// matmul returns the product c = a * b
//
void matmul(const double a[][N], const double b[][N], double c[][N], int n) {
    
    #pragma omp parallel for
    for (int i = 0; i < n; i++) {
        for (int k = 0; k < n; k++) {
            for (int j = 0; j < n; j++) {
                c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
}

// checksum returns the sum of the coefficients in matrix x[N][N]
//
double checksum(const double x[][N]) {
    double sum = 0.0;

    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            sum += x[i][j];
        }
    }
    return sum;
}

// initialize initializes matrix a[N][N]
//
void initialize(double a[][N]) {

    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a[i][j] = static_cast<double>(i * j) / (N * N);
        }
    }
}

void reportTime(const char* msg, std::chrono::steady_clock::duration span) {
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(span);
    std::cout << msg << ": " <<
        ms.count() << " milliseconds" << std::endl;
}

int main(int argc, char** argv) {
    if (argc != 1) {
        std::cerr << "*** Incorrect number of arguments ***\n";
        std::cerr << "Usage: " << argv[0] << "\n";
        return 1;
    }
    std::cout << std::fixed << std::setprecision(2);

    // allocate memory for matrices
    double* a = new double[N*N];
    double* b = new double[N*N];
    double* c = new double[N*N];
    array* aa = (array*)a;
    array* bb = (array*)b;
    array* cc = (array*)c;
    std::cout << "Matrix Multiplication " << N << " by " << N << std::endl;

    // initialize a and b
    std::chrono::steady_clock::time_point ts, te;
    ts = std::chrono::steady_clock::now();
    initialize(aa);
    initialize(bb);
    double* t = c;
    for (int i = 0; i < N * N; i++)
        *t++ = 0.0;
    te = std::chrono::steady_clock::now();
    reportTime("initialization", te - ts);

    // multiply matrices a and b
    ts = std::chrono::steady_clock::now();
    matmul(aa, bb, cc, N);
    te = std::chrono::steady_clock::now();
    reportTime("execution", te - ts);
    std::cout << "checksum = " << checksum(cc) << std::endl;

    // deallocate memory
    delete[] a;
    delete[] b;
    delete[] c;

    std::cout << "max threads: " << omp_get_max_threads() << std::endl;
}

N = 512

function matmul(a, b, c, n)
    Threads.@threads for i = 1:n
        for j = 1:n
            for k = 1:n
                c[i,j] += a[i,k] * b[k,j]
            end
        end
    end

    return c
end 

function checksum(x)
    sum = 0.0

    Threads.@threads for i = 1:N
        for j = 1:N
            sum += x[i,j]
        end
    end

    return sum
end

# creates and returns 2d array of zeroes
function initialize()
    a = zeros(Float64, N, N)

    Threads.@threads for i = 1:N
        for j = 1:N
            a[i,j] = (convert(Float64,((i-1) * (j-1))) / convert(Float64, (N * N)))
        end
    end

    return a
end

a = initialize();
b = initialize();
c = initialize();

start_t = time()

d = matmul(a, b, c, N)

end_t = time()
run_time = end_t - start_t

sum = checksum(d)

println("checksum = $sum")

println("matmul func time (s):")
println(run_time)

println("num threads: ")
nt = Threads.nthreads()
println(nt)

More code here: https://github.com/tsarkarsc/parallel_prog

OpenMP vs Julia Results

add graphs
recap loop interchange benefits for openmp (locality of reference)
discuss julia storing arrays as column major, loop interchange was worse for julia
discuss different levels of optimization

Conclusion

summary of everything

GPU621/Threadless Horsemen

Contents

Comparing Multi-threading in Julia vs OpenMP

Group Members

Introduction: The Julia Programming Language

Julia's Forms of Parallelism

OpenMP vs Julia Code

OpenMP vs Julia Results

Conclusion

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools