Difference between revisions of "Team Lion F2017"
Sgupta7857 (talk | contribs) (→Basic hotspot analysis) |
Jsbhamber2 (talk | contribs) (→Concurrency) |
||
Line 47: | Line 47: | ||
====matmul_0 (Serial)==== | ====matmul_0 (Serial)==== | ||
+ | <pre> | ||
+ | double matmul_0(const double* a, const double* b, double* c, int n) { | ||
+ | for (int i = 0; i < n; i++) { | ||
+ | for (int j = 0; j < n; j++) { | ||
+ | double sum = 0.0; | ||
+ | for (int k = 0; k < n; k++) | ||
+ | sum += a[i * n + k] * b[k * n + j]; | ||
+ | c[i * n + j] = sum; | ||
+ | } | ||
+ | } | ||
+ | double diag = 0.0; | ||
+ | for (int i = 0; i < n; i++) | ||
+ | diag += c[i * n + i]; | ||
+ | return diag; | ||
+ | } | ||
+ | </pre> | ||
[[File:Conc-01.png]] | [[File:Conc-01.png]] | ||
Line 53: | Line 69: | ||
====matmul_1 (Serial with j-k loops reversed)==== | ====matmul_1 (Serial with j-k loops reversed)==== | ||
+ | <pre> | ||
+ | double matmul_1(const double* a, const double* b, double* c, int n) { | ||
+ | |||
+ | for (int i = 0; i < n; i++) { | ||
+ | for (int k = 0; k < n; k++) { | ||
+ | double sum = 0.0; | ||
+ | for (int j = 0; j < n; j++) | ||
+ | sum += a[i * n + k] * b[k * n + j]; | ||
+ | c[i * n + k] = sum; | ||
+ | } | ||
+ | } | ||
+ | double diag = 0.0; | ||
+ | for (int i = 0; i < n; i++) | ||
+ | diag += c[i * n + i]; | ||
+ | return diag; | ||
+ | } | ||
+ | </pre> | ||
[[File:Conc-11.png]] | [[File:Conc-11.png]] | ||
Line 59: | Line 92: | ||
====matmul_2 (Cilk Plus with cilk_for)==== | ====matmul_2 (Cilk Plus with cilk_for)==== | ||
+ | <pre> | ||
+ | double matmul_2(const double* a, const double* b, double* c, int n) { | ||
+ | |||
+ | cilk_for (int i = 0; i < n; i++) { | ||
+ | cilk_for (int j = 0; j < n; j++) { | ||
+ | double sum = 0.0; | ||
+ | for(int k = 0; k < n; k++) { | ||
+ | sum += a[i * n + k] * b[k * n + j]; | ||
+ | } | ||
+ | c[i * n + j] = sum; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | double diag = 0.0; | ||
+ | for (int i = 0; i < n; i++) | ||
+ | diag += c[i * n + i]; | ||
+ | return diag; | ||
+ | } | ||
+ | </pre> | ||
[[File:Conc-21.png]] | [[File:Conc-21.png]] | ||
Line 65: | Line 117: | ||
====matmul_3 (+array notation, reducer)==== | ====matmul_3 (+array notation, reducer)==== | ||
+ | <pre> | ||
+ | double matmul_3(const double* a, const double* b, double* c, int n) { | ||
+ | |||
+ | cilk_for(int i = 0; i < n; i++) { | ||
+ | cilk_for(int j = 0; j < n; j++) { | ||
+ | double sum = 0.0; | ||
+ | for (int k = 0; k < n; k++) { | ||
+ | sum += a[i * n + k] * b[k * n + j]; | ||
+ | } | ||
+ | c[i * n + j] = sum; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | cilk::reducer_opadd <double> diag(0.0); | ||
+ | cilk_for(int i = 0; i < n; i++) { | ||
+ | diag += c[i * n + i]; | ||
+ | } | ||
+ | return diag.get_value(); | ||
+ | } | ||
+ | </pre> | ||
[[File:Conc-31.png]] | [[File:Conc-31.png]] | ||
Line 71: | Line 143: | ||
====matmul_4 (+vectorization)==== | ====matmul_4 (+vectorization)==== | ||
+ | <pre> | ||
+ | double matmul_4(const double* a, const double* b, double* c, int n) { | ||
+ | |||
+ | cilk_for(int i = 0; i < n; i++) { | ||
+ | cilk_for(int j = 0; j < n; j++) { | ||
+ | double sum = 0.0; | ||
+ | #pragma simd | ||
+ | for (int k = 0; k < n; k++) { | ||
+ | sum += a[i * n + k] * b[k * n + j]; | ||
+ | } | ||
+ | c[i * n + j] = sum; | ||
+ | } | ||
+ | } | ||
+ | |||
+ | cilk::reducer_opadd <double> diag(0.0); | ||
+ | cilk_for(int i = 0; i < n; i++) { | ||
+ | diag += c[i * n + i]; | ||
+ | } | ||
+ | return diag.get_value(); | ||
+ | } | ||
+ | </pre> | ||
[[File:Conc-41.png]] | [[File:Conc-41.png]] | ||
[[File:Conc-42.png]] | [[File:Conc-42.png]] | ||
+ | |||
+ | ====Final test with all functions==== | ||
+ | |||
+ | |||
+ | [[File:Conc-51.png]] | ||
+ | [[File:Conc-52.png]] | ||
+ | |||
+ | [[File:Conc-53.png]] | ||
====Final test with all functions==== | ====Final test with all functions==== |
Revision as of 11:15, 5 January 2018
Contents
- 1 Group Members
- 1.1 What is VTune Amplifier?
- 1.2 How to use it?
- 1.3 Hotspots
- 1.4 Parallelism
- 1.4.1 Concurrency
- 1.4.1.1 Results of Concurrency tests on Workshop 6
- 1.4.1.2 matmul_0 (Serial)
- 1.4.1.3 matmul_1 (Serial with j-k loops reversed)
- 1.4.1.4 matmul_2 (Cilk Plus with cilk_for)
- 1.4.1.5 matmul_3 (+array notation, reducer)
- 1.4.1.6 matmul_4 (+vectorization)
- 1.4.1.7 Final test with all functions
- 1.4.1.8 Final test with all functions
- 1.4.2 Locks & Waits
- 1.4.3 HPC Performance Characterization
- 1.4.1 Concurrency
- 1.5 Microarchitecture
- 1.6 references
Group Members
Intel Parallel Studio VTune Amplifier
What is VTune Amplifier?
- A tool created by Intel to provide performance analysis on software.
- Offers both a GUI and command-line version for both Windows and Linux
- GUI only for OSX
- Basic features available on both Intel and AMD processors, but advanced features only for Intel
How to use it?
- Available as a standalone unit or part of the following packages:
- Intel Parallel Studio XE Cluster Edition and Professional Edition
- Intel Media Server Studio Professional Edition
- Intel System Studio
Can be run on a local machine
Hotspots
Basic hotspot analysis
We used our Workshop 6 as an example to demonstrate this particular aspect of Intel VTune Amplifier.
Advanced hotspot analysis
Parallelism
Concurrency
- Best for visualizing thread parallelism on available cores, finding areas with high or low concurrency, and identifying serial bottlenecks in your code
- Provides information on how many threads were running at each moment during application execution
- Includes threads which are currently running or ready to run and therefore are not waiting at a defined waiting or blocking API
- Also shows CPU time while the hotspot was executing and estimates its effectiveness either by CPU usage or by Threads Concurrency
Results of Concurrency tests on Workshop 6
I ran the Concurrency test on each of the functions in Workshop 6. I isolated each function by commenting out all the others, then ran them one at a time to get an idea of how each performs on its own. Finally, I ran them all together to see how the program performs overall.
matmul_0 (Serial)
// Serial reference matrix product (i-j-k loop order).
//
// a, b and c are indexed as n-by-n matrices stored row by row
// (element (r, s) lives at index r * n + s). Every element of c is
// overwritten with the dot product of a row of a and a column of b.
//
// Returns the sum of the main-diagonal elements of c.
double matmul_0(const double* a, const double* b, double* c, int n) {
    for (int row = 0; row < n; ++row) {
        const double* a_row = a + row * n;
        double* c_row = c + row * n;
        for (int col = 0; col < n; ++col) {
            // Accumulate in a local so c is written once per element.
            double acc = 0.0;
            int k = 0;
            while (k < n) {
                acc += a_row[k] * b[k * n + col];
                ++k;
            }
            c_row[col] = acc;
        }
    }

    // Sum the main diagonal of the freshly computed product.
    double trace = 0.0;
    for (int d = 0; d < n; ++d) {
        trace += c[d * n + d];
    }
    return trace;
}
matmul_1 (Serial with j-k loops reversed)
// Serial matrix product with the j and k loops interchanged (i-k-j order).
//
// a, b and c are indexed as n-by-n matrices stored row by row
// (element (r, s) lives at index r * n + s). With k in the middle loop,
// the innermost loop walks row k of b and row i of c contiguously.
//
// Because each c[i][j] is built up across the k iterations, row i of c is
// zeroed first; any previous contents of c are discarded.
// NOTE(review): c is assumed not to overlap a or b -- confirm against callers.
//
// Returns the sum of the main-diagonal elements of c.
double matmul_1(const double* a, const double* b, double* c, int n) {
    for (int i = 0; i < n; i++) {
        double* c_row = c + i * n;

        // Row i is accumulated into below, so it must start from zero.
        for (int j = 0; j < n; j++)
            c_row[j] = 0.0;

        for (int k = 0; k < n; k++) {
            // a[i][k] does not change across the inner j loop.
            const double a_ik = a[i * n + k];
            const double* b_row = b + k * n;
            for (int j = 0; j < n; j++)
                c_row[j] += a_ik * b_row[j];
        }
    }

    double diag = 0.0;
    for (int i = 0; i < n; i++)
        diag += c[i * n + i];
    return diag;
}
matmul_2 (Cilk Plus with cilk_for)
double matmul_2(const double* a, const double* b, double* c, int n) { cilk_for (int i = 0; i < n; i++) { cilk_for (int j = 0; j < n; j++) { double sum = 0.0; for(int k = 0; k < n; k++) { sum += a[i * n + k] * b[k * n + j]; } c[i * n + j] = sum; } } double diag = 0.0; for (int i = 0; i < n; i++) diag += c[i * n + i]; return diag; }
matmul_3 (+array notation, reducer)
double matmul_3(const double* a, const double* b, double* c, int n) { cilk_for(int i = 0; i < n; i++) { cilk_for(int j = 0; j < n; j++) { double sum = 0.0; for (int k = 0; k < n; k++) { sum += a[i * n + k] * b[k * n + j]; } c[i * n + j] = sum; } } cilk::reducer_opadd <double> diag(0.0); cilk_for(int i = 0; i < n; i++) { diag += c[i * n + i]; } return diag.get_value(); }
matmul_4 (+vectorization)
// Matrix product with cilk_for outer loops, a vectorized inner dot product,
// and a Cilk Plus reducer for the diagonal sum.
//
// a, b and c are indexed as n-by-n matrices stored row by row
// (element (r, s) lives at index r * n + s). Each (i, j) iteration writes
// a distinct element of c.
//
// Returns the sum of the main-diagonal elements of c, accumulated through
// a cilk::reducer_opadd<double> inside a cilk_for loop.
double matmul_4(const double* a, const double* b, double* c, int n) {
    cilk_for (int i = 0; i < n; i++) {
        cilk_for (int j = 0; j < n; j++) {
            double sum = 0.0;
            // `#pragma simd` mandates vectorization, so the accumulation
            // into `sum` must be declared as a reduction; without the
            // clause the loop-carried dependence on `sum` is not accounted
            // for. The pragma must also sit on a line of its own.
            #pragma simd reduction(+:sum)
            for (int k = 0; k < n; k++) {
                sum += a[i * n + k] * b[k * n + j];
            }
            c[i * n + j] = sum;
        }
    }

    // The reducer lets the cilk_for iterations add to one total.
    cilk::reducer_opadd<double> diag(0.0);
    cilk_for (int i = 0; i < n; i++) {
        diag += c[i * n + i];
    }
    return diag.get_value();
}
Final test with all functions
Final test with all functions
Locks & Waits
HPC Performance Characterization
Microarchitecture
General Exploration
Memory Access
references
https://en.wikipedia.org/wiki/VTune
https://software.intel.com/en-us/get-started-with-vtune
https://software.intel.com/en-us/vtune-amplifier-help-analysis-types
https://software.intel.com/en-us/vtune-amplifier-help-basic-hotspots-analysis
https://software.intel.com/en-us/vtune-amplifier-help-advanced-hotspots-analysis
https://software.intel.com/en-us/vtune-amplifier-help-concurrency-analysis
https://software.intel.com/en-us/vtune-amplifier-help-locks-and-waits-analysis
https://software.intel.com/en-us/vtune-amplifier-help-hpc-performance-characterization-analysis
https://software.intel.com/en-us/vtune-amplifier-help-general-exploration-analysis
https://software.intel.com/en-us/vtune-amplifier-help-memory-access-analysis