Changes

GPU621/Group 7

9,322 bytes added, 04:56, 14 April 2023

no edit summary

Data Fitting: Spline based approximation functions, derivatives, integrals and cell search

Additional examples of MKL functionality implemented in relation to real world models can be found in the optimization of various differential equations relating to finance. The Black-Scholes-Merton (BSM) model deals with pricing options contracts.

Serial

#include <omp.h>

#include <mathimf.h>

#include "euro_opt.h"

#ifdef __DO_FLOAT__

# define EXP(x) expf(x)

# define LOG(x) logf(x)

# define SQRT(x) sqrtf(x)

# define ERF(x) erff(x)

# define INVSQRT(x) 1.0f/sqrtf(x)

# define QUARTER 0.25f

# define HALF 0.5f

# define TWO 2.0f

#else

# define EXP(x) exp(x)

# define LOG(x) log(x)

# define SQRT(x) sqrt(x)

# define ERF(x) erf(x)

# define INVSQRT(x) 1.0/sqrt(x)

# define QUARTER 0.25

# define HALF 0.5

# define TWO 2.0

#endif

/*

// This function computes the Black-Scholes formula.

// Input parameters:

// nopt - length of arrays

// s0 - initial price

// x - strike price

// t - maturity

//

// Implementation assumes fixed constant parameters

// r - risk-neutral rate

// sig - volatility

//

// Output arrays for call and put prices:

// vcall, vput

//

// Note: the restrict keyword here tells the compiler

// that none of the arrays overlap in memory.

*/

void BlackScholesFormula_Compiler( int nopt,

tfloat r, tfloat sig, tfloat * restrict s0, tfloat * restrict x,

tfloat * restrict t, tfloat * restrict vcall, tfloat * restrict vput )

{

int i;

tfloat a, b, c, y, z, e;

tfloat d1, d2, w1, w2;

tfloat mr = -r;

tfloat sig_sig_two = sig * sig * TWO;

#pragma omp parallel for shared(s0, x, t, vcall, vput)

#pragma vector nontemporal (vcall, vput)

#pragma simd

for ( i = 0; i < nopt; i++ )

{

a = LOG( s0[i] / x[i] );

b = t[i] * mr;

z = t[i] * sig_sig_two;

c = QUARTER * z;

e = EXP ( b );

y = INVSQRT( z );

w1 = ( a - b + c ) * y;

w2 = ( a - b - c ) * y;

d1 = ERF( w1 );

d2 = ERF( w2 );

d1 = HALF + HALF*d1;

d2 = HALF + HALF*d2;

vcall[i] = s0[i]*d1 - x[i]*e*d2;

vput[i] = vcall[i] - s0[i] + x[i]*e;

}

</syntaxhighlight>

MKL

#include <omp.h>

#include <mkl.h>

#include "euro_opt.h"

#ifdef __DO_FLOAT__

# define VDIV(n,a,b,r) vsDiv(n,a,b,r)

# define VLOG(n,a,r) vsLn(n,a,r)

# define VEXP(n,a,r) vsExp(n,a,r)

# define VINVSQRT(n,a,r) vsInvSqrt(n,a,r)

# define VERF(n,a,r) vsErf(n,a,r)

# define QUARTER 0.25f

# define HALF 0.5f

# define TWO 2.0f

#else

# define VDIV(n,a,b,r) vdDiv(n,a,b,r)

# define VLOG(n,a,r) vdLn(n,a,r)

# define VEXP(n,a,r) vdExp(n,a,r)

# define VINVSQRT(n,a,r) vdInvSqrt(n,a,r)

# define VERF(n,a,r) vdErf(n,a,r)

# define QUARTER 0.25

# define HALF 0.5

# define TWO 2.0

#endif

#if defined _VML_ACCURACY_EP_

# define VML_ACC VML_EP

#elif defined _VML_ACCURACY_LA_

# define VML_ACC VML_LA

#elif defined _VML_ACCURACY_HA_

# define VML_ACC VML_HA

#else

# error: _VML_ACCURACY_HA_/LA/EP should be defined in makefile

#endif

/* Set the reusable buffer for intermediate results */

#if !defined NBUF

# define NBUF 1024

#endif

/*

// This function computes the Black-Scholes formula.

// Input parameters:

// nopt - length of arrays

// s0 - initial price

// x - strike price

// t - maturity

//

// Implementation assumes fixed constant parameters

// r - risk-neutral rate

// sig - volatility

//

// Output arrays for call and put prices:

// vcall, vput

//

// Note: the restrict keyword here tells the compiler

// that none of the arrays overlap in memory.

//

// Note: the implementation assumes nopt is a multiple of NBUF

*/

void BlackScholesFormula_MKL( int nopt,

tfloat r, tfloat sig, tfloat * restrict s0, tfloat * restrict x,

tfloat * restrict t, tfloat * restrict vcall, tfloat * restrict vput )

{

int i;

tfloat mr = -r;

tfloat sig_sig_two = sig * sig * TWO;

#pragma omp parallel for \

shared(s0, x, t, vcall, vput, mr, sig_sig_two, nopt) \

default(none)

for ( i = 0; i < nopt; i+= NBUF )

{

int j;

tfloat *a, *b, *c, *y, *z, *e;

tfloat *d1, *d2, *w1, *w2;

__declspec(align(ALIGN_FACTOR)) tfloat Buffer[NBUF*4];

// This computes vector length for the last iteration of the loop

// in case nopt is not exact multiple of NBUF

#define MY_MIN(x, y) ((x) < (y)) ? (x) : (y)

int nbuf = MY_MIN(NBUF, nopt - i);

a = Buffer + NBUF*0; w1 = a; d1 = w1;

c = Buffer + NBUF*1; w2 = c; d2 = w2;

b = Buffer + NBUF*2; e = b;

z = Buffer + NBUF*3; y = z;

// Must set VML accuracy in each thread

vmlSetMode( VML_ACC );

VDIV(nbuf, s0 + i, x + i, a);

VLOG(nbuf, a, a);

#pragma simd

for ( j = 0; j < nbuf; j++ )

{

b[j] = t[i + j] * mr;

a[j] = a[j] - b[j];

z[j] = t[i + j] * sig_sig_two;

c[j] = QUARTER * z[j];

}

VINVSQRT(nbuf, z, y);

VEXP(nbuf, b, e);

#pragma simd

for ( j = 0; j < nbuf; j++ )

{

tfloat aj = a[j];

tfloat cj = c[j];

w1[j] = ( aj + cj ) * y[j];

w2[j] = ( aj - cj ) * y[j];

}

VERF(nbuf, w1, d1);

VERF(nbuf, w2, d2);

#pragma simd

for ( j = 0; j < nbuf; j++ )

{

d1[j] = HALF + HALF*d1[j];

d2[j] = HALF + HALF*d2[j];

vcall[i+j] = s0[i+j]*d1[j] - x[i+j]*e[j]*d2[j];

vput[i+j] = vcall[i+j] - s0[i+j] + x[i+j]*e[j];

}

</syntaxhighlight>

== Comparisons to AOCL and OpenBLAS ==

Due to MKL being Intel's proprietary library the issue of performance arises when utilized on AMD platforms. It has been reported that MKL is slower on AMD hardware than Intel hardware, due to a check performed by MKL to determine if the processor used is Intel's or not. While this check can be circumvented, it is an extra step which may inconvenience those working on AMD systems.

AMD has its own set of libraries dealing with optimization, named AMD Optimizing CPU Libraries (AOCL). Instead of being based off of BLAS like MKL, AOCL is based off of BLIS, which stands for BLAS Like Interface Software. As can be seen from the name BLIS is functionally comparable to BLAS. The origin of BLIS dates back to the 2000s when BLAS needed to be updated due to improvements in computer architecture. BLIS focused on providing an alternative to BLAS, and as of modern times is capable of providing comparable performance to BLAS.

While Intel's MKL is regarded as one of the most efficient libraries dealing with mathematical computation, certain tests have shown AMD's AOCL can provide better performance in certain scenarios. A comparison done with Matrix-Matrix multiplication and Cholesky decomposition showed that AOCL could in some instances obtain a faster computation time than MKL. However it should be noted that there is variance between testing results based on hardware and deviation. The script used to perform the tests can be found below.

format compact

blas = version('-blas')

lapack = version('-lapack')

N=10000;

fprintf("Matrix size is %d x %d\n",N,N);

a = rand(N);

b = rand(N);

cholmat = a*a';

tic

c=a*b;

matmulTime = toc;

fprintf("Matrix Multiply time is %.2fs\n",matmulTime);

tic

c=a*b;

matmulTime = toc;

fprintf("Matrix Multiply time is %.2fs\n",matmulTime);

tic

c=a*b;

matmulTime = toc;

fprintf("Matrix Multiply time is %.2fs\n",matmulTime);

tic;

R = chol(cholmat);

cholTime = toc;

fprintf("Cholesky time is %.2fs\n",cholTime);

tic;

R = chol(cholmat);

cholTime = toc;

fprintf("Cholesky time is %.2fs\n",cholTime);

tic;

R = chol(cholmat);

cholTime = toc;

fprintf("Cholesky time is %.2fs\n",cholTime);

</syntaxhighlight>

OpenBLAS is another open source implementation of the BLAS library dealing with mathematical computation. It was designed to be a successor to GotoBLAS and function on a wide variety of platforms such as x86, ARMv8, MIPS and RISC-V. OpenBLAS is much more portable and flexible in terms of utilization across systems than MKL, however it performs noticeably slower than MKL on a variety of tests, particularly for Eigenvalue computations.

{| class="wikitable"

|+ MKL vs OpenBLAS

|-

!BLAS/LAPACK

!Ordinarly Least Squares

!Matric Multiplication

!Eigenvalue Computation

|-

|Built In

|2.294

|100+

|34.716

|-

|OpenBLAS

|1.642

|2.654

|3.190

|-

|MKL

|1.570

|2.305

|2.404

|}

Again, it should be noted that if MKL were to be running on an AMD platform without bypassing the hardware check, it would perform slower than OpenBLAS. However, while the potential decrease in performance should be acknowledged, given the ability to circumvent it the maximum possible performance should be taken into consideration for the comparison instead.

https://www.sciencedirect.com/topics/computer-science/intel-math-kernel-library

https://iq.opengenus.org/blas-vs-blis/

https://scrp.econ.cuhk.edu.hk/blog/analysis/2022/02/07/mkl-optimization.html

Jyzhang7

24

edits

Changes

GPU621/Group 7

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools