Difference between revisions of "GPU621/False Sharing"
(→Project Name) |
(→Analyzing Workshop Example) |
||
Line 15: | Line 15: | ||
=== Analyzing Workshop Example === | === Analyzing Workshop Example === | ||
+ | <pre> | ||
+ | #include <iostream> | ||
+ | #include <iomanip> | ||
+ | #include <cstdlib> | ||
+ | #include <chrono> | ||
+ | #include <omp.h> | ||
+ | |||
+ | #define NUM_THREADS 8 | ||
+ | |||
+ | using namespace std::chrono; | ||
+ | |||
+ | // report system time | ||
// Print how long a measured phase took, in whole milliseconds.
//
// msg  - label printed in front of the timing. A null pointer is treated as
//        an empty label, because streaming a null `const char*` into
//        std::cout is undefined behaviour.
// span - elapsed time to report; duration_cast truncates it (no rounding)
//        to whole milliseconds.
//
// Writes one line of the form "<msg> - took - <N> milliseconds" to std::cout
// and flushes the stream.
void reportTime(const char* msg, std::chrono::steady_clock::duration span)
{
    const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(span);
    const char* label = (msg != nullptr) ? msg : "";
    std::cout << label << " - took - " <<
        ms.count() << " milliseconds" << std::endl;
}
+ | |||
+ | int main(int argc, char** argv) | ||
+ | { | ||
+ | if (argc != 2) | ||
+ | { | ||
+ | std::cerr << argv[0] << ": invalid number of arguments\n"; | ||
+ | std::cerr << "Usage: " << argv[0] << " no_of_slices\n"; | ||
+ | return 1; | ||
+ | } | ||
+ | int n = std::atoi(argv[1]); | ||
+ | steady_clock::time_point ts, te; | ||
+ | |||
+ | // calculate pi by integrating the area under 1/(1 + x^2) in n steps | ||
+ | ts = steady_clock::now(); | ||
+ | |||
+ | int actual_thread_count; | ||
+ | double pi = 0.0f; | ||
+ | double sum[NUM_THREADS] = { 0.0f }; | ||
+ | double step = 1.0 / (double)n; | ||
+ | |||
+ | omp_set_num_threads(NUM_THREADS); | ||
+ | #pragma omp parallel | ||
+ | { | ||
+ | int id, num_threads; | ||
+ | double x; | ||
+ | |||
+ | id = omp_get_thread_num(); | ||
+ | num_threads = omp_get_num_threads(); | ||
+ | |||
+ | // get master thread to return how many threads were actually created | ||
+ | if (id == 0) | ||
+ | { | ||
+ | actual_thread_count = num_threads; | ||
+ | } | ||
+ | |||
+ | // each thread is responsible for calculating the area of a specific set of sections underneath the curve | ||
+ | for (int i = id; i < n; i = i + num_threads) | ||
+ | { | ||
+ | x = ((double)i + 0.5f) * step; | ||
+ | sum[id] += 1.0f / (1.0f + x * x); | ||
+ | } | ||
+ | } | ||
+ | |||
+ | // sum up each calculation to get approximation of pi | ||
+ | for (int i = 0; i < actual_thread_count; i++) | ||
+ | { | ||
+ | pi += 4 * sum[i] * step; | ||
+ | } | ||
+ | |||
+ | te = steady_clock::now(); | ||
+ | |||
+ | std::cout << "n = " << n << | ||
+ | std::fixed << std::setprecision(15) << | ||
+ | "\n pi(exact) = " << 3.141592653589793 << | ||
+ | "\n pi(calcd) = " << pi << std::endl; | ||
+ | reportTime("Integration", te - ts); | ||
+ | } | ||
+ | </pre> | ||
== Solutions to False Sharing == | == Solutions to False Sharing == |
Revision as of 01:09, 26 November 2021
Contents
Analyzing False Sharing
Group Members
- Kevin Chou
Introduction
The Cache
What is a Cache?
Cache Coherence and Cache Line
False Sharing
Analyzing Workshop Example
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#include <omp.h>

#define NUM_THREADS 8

using namespace std::chrono;

// Print how long a measured phase took, in whole milliseconds.
//
// msg  - label printed in front of the timing. A null pointer is treated as
//        an empty label, because streaming a null `const char*` into
//        std::cout is undefined behaviour.
// span - elapsed time to report; duration_cast truncates it (no rounding)
//        to whole milliseconds.
//
// Writes one line of the form "<msg> - took - <N> milliseconds" to std::cout
// and flushes the stream.
void reportTime(const char* msg, steady_clock::duration span)
{
    const auto ms = duration_cast<milliseconds>(span);
    const char* label = (msg != nullptr) ? msg : "";
    std::cout << label << " - took - " <<
        ms.count() << " milliseconds" << std::endl;
}

// Approximate pi by integrating 1/(1 + x^2) over [0, 1] in n midpoint steps
// (scaled by 4) inside an OpenMP parallel region, then print the result next
// to a reference value and report the elapsed time.
//
// Usage: <program> no_of_slices
//   no_of_slices - number of integration steps; must be a positive integer
//                  that fits in an int.
//
// Returns 0 on success and 1 when the command line is invalid.
int main(int argc, char** argv)
{
    // argv[0] may be missing when argc == 0; never stream a null pointer.
    const char* prog = (argc > 0 && argv[0] != nullptr) ? argv[0] : "pi";

    if (argc != 2)
    {
        std::cerr << prog << ": invalid number of arguments\n";
        std::cerr << "Usage: " << prog << " no_of_slices\n";
        return 1;
    }

    // Parse strictly. The previous atoi() call turned non-numeric text into
    // 0 and accepted negative counts, which made step infinite or negative
    // and produced a NaN / meaningless result. Values too large for int are
    // rejected by the round-trip comparison below.
    char* end = nullptr;
    const long long parsed = std::strtoll(argv[1], &end, 10);
    const int n = static_cast<int>(parsed);
    if (end == argv[1] || *end != '\0' || parsed != n || n <= 0)
    {
        std::cerr << prog << ": no_of_slices must be a positive integer\n";
        std::cerr << "Usage: " << prog << " no_of_slices\n";
        return 1;
    }

    // calculate pi by integrating the area under 1/(1 + x^2) in n steps
    const steady_clock::time_point ts = steady_clock::now();

    int actual_thread_count = 0;
    double pi = 0.0;
    // One partial sum per thread. NOTE: the slots are adjacent in memory, so
    // threads updating neighbouring slots in the hot loop are likely to
    // contend for the same cache line (false sharing). This layout is kept
    // deliberately: it is the behaviour this example is presented to analyse.
    double sum[NUM_THREADS] = { 0.0 };
    const double step = 1.0 / static_cast<double>(n);

    omp_set_num_threads(NUM_THREADS);
    #pragma omp parallel
    {
        const int id = omp_get_thread_num();
        const int num_threads = omp_get_num_threads();

        // thread 0 records how many threads the runtime actually created,
        // which may be fewer than NUM_THREADS
        if (id == 0)
        {
            actual_thread_count = num_threads;
        }

        // Each thread handles slices id, id + num_threads, id + 2*num_threads, ...
        // The index is 64-bit so that i + num_threads cannot overflow when n
        // is close to the largest int.
        for (long long i = id; i < n; i += num_threads)
        {
            const double x = (static_cast<double>(i) + 0.5) * step;
            sum[id] += 1.0 / (1.0 + x * x);
        }
    }

    // combine the per-thread partial sums into the approximation of pi
    for (int i = 0; i < actual_thread_count; i++)
    {
        pi += 4 * sum[i] * step;
    }

    const steady_clock::time_point te = steady_clock::now();

    std::cout << "n = " << n <<
        std::fixed << std::setprecision(15) <<
        "\n pi(exact) = " << 3.141592653589793 <<
        "\n pi(calcd) = " << pi << std::endl;
    reportTime("Integration", te - ts);
    return 0;
}