Difference between revisions of "GPU621/False Sharing"

From CDOT Wiki
Jump to: navigation, search
(Project Name)
(Analyzing Workshop Example)
Line 15: Line 15:
  
 
=== Analyzing Workshop Example ===
 
=== Analyzing Workshop Example ===
 +
<pre>
 +
#include <iostream>
 +
#include <iomanip>
 +
#include <cstdlib>
 +
#include <chrono>
 +
#include <omp.h>
 +
 +
#define NUM_THREADS 8
 +
 +
using namespace std::chrono;
 +
 +
// report system time
 +
void reportTime(const char* msg, steady_clock::duration span)
 +
{
 +
    auto ms = duration_cast<milliseconds>(span);
 +
    std::cout << msg << " - took - " <<
 +
        ms.count() << " milliseconds" << std::endl;
 +
}
 +
 +
int main(int argc, char** argv)
 +
{
 +
    if (argc != 2)
 +
    {
 +
        std::cerr << argv[0] << ": invalid number of arguments\n";
 +
        std::cerr << "Usage: " << argv[0] << "  no_of_slices\n";
 +
        return 1;
 +
    }
 +
    int n = std::atoi(argv[1]);
 +
    steady_clock::time_point ts, te;
 +
 +
    // calculate pi by integrating the area under 1/(1 + x^2) in n steps
 +
    ts = steady_clock::now();
 +
 +
    int actual_thread_count;
 +
    double pi = 0.0f;
 +
    double sum[NUM_THREADS] = { 0.0f };
 +
    double step = 1.0 / (double)n;
 +
 +
    omp_set_num_threads(NUM_THREADS);
 +
    #pragma omp parallel
 +
    {
 +
        int id, num_threads;
 +
        double x;
 +
 +
        id = omp_get_thread_num();
 +
        num_threads = omp_get_num_threads();
 +
 +
        // get master thread to return how many threads were actually created
 +
        if (id == 0)
 +
        {
 +
            actual_thread_count = num_threads;
 +
        }
 +
 +
        // each thread is responsible for calculating the area of a specific set of sections underneath the curve
 +
        for (int i = id; i < n; i = i + num_threads)
 +
        {
 +
            x = ((double)i + 0.5f) * step;
 +
            sum[id] += 1.0f / (1.0f + x * x);
 +
        }
 +
    }
 +
 +
    // sum up each calculation to get approximation of pi
 +
    for (int i = 0; i < actual_thread_count; i++)
 +
    {
 +
        pi += 4 * sum[i] * step;
 +
    }
 +
 +
    te = steady_clock::now();
 +
 +
    std::cout << "n = " << n <<
 +
        std::fixed << std::setprecision(15) <<
 +
        "\n pi(exact) = " << 3.141592653589793 <<
 +
        "\n pi(calcd) = " << pi << std::endl;
 +
    reportTime("Integration", te - ts);
 +
}
 +
</pre>
  
 
== Solutions to False Sharing ==
 
== Solutions to False Sharing ==

Revision as of 01:09, 26 November 2021

Analyzing False Sharing

Group Members

- Kevin Chou

Introduction

The Cache

What is a Cache?

Cache Coherence and Cache Line

False Sharing

Analyzing Workshop Example

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#include <omp.h>

#define NUM_THREADS 8

using namespace std::chrono;

// Prints "<msg> - took - <N> milliseconds" to std::cout, where N is the
// elapsed span converted to whole milliseconds (duration_cast drops the
// fractional part). The line is terminated with std::endl, so the stream
// is flushed on every call.
//
// msg  - label printed in front of the timing
// span - elapsed time, e.g. the difference of two steady_clock time points
void reportTime(const char* msg, steady_clock::duration span)
{
    const milliseconds elapsed = duration_cast<milliseconds>(span);
    std::cout << msg << " - took - " << elapsed.count() << " milliseconds";
    std::cout << std::endl;
}

// Approximates pi by midpoint-rule integration of 4 / (1 + x^2) over [0, 1]
// using n slices, with the slices dealt out cyclically to an OpenMP thread
// team. Prints n, a reference value of pi, the computed value, and the
// elapsed time.
//
// Usage: <program> no_of_slices
// Returns 1 when the argument count is wrong or no_of_slices is not a
// positive integer.
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        std::cerr << argv[0] << ": invalid number of arguments\n";
        std::cerr << "Usage: " << argv[0] << "  no_of_slices\n";
        return 1;
    }

    const int n = std::atoi(argv[1]);
    // atoi yields 0 for non-numeric text. With n == 0 the step below would be
    // 1.0 / 0 (infinite) and the result would print as NaN; a negative n
    // would silently integrate nothing. Reject both up front.
    if (n <= 0)
    {
        std::cerr << argv[0] << ": no_of_slices must be a positive integer\n";
        std::cerr << "Usage: " << argv[0] << "  no_of_slices\n";
        return 1;
    }

    steady_clock::time_point ts, te;

    // calculate pi by integrating the area under 1/(1 + x^2) in n steps
    ts = steady_clock::now();

    // Size of the team the runtime actually provided; it may be smaller than
    // the NUM_THREADS that was requested.
    int actual_thread_count = 0;
    double pi = 0.0;
    // One partial sum per thread. The slots are adjacent doubles that
    // different threads update in the hot loop. This layout is kept exactly
    // as in the original listing because it is the example the surrounding
    // False Sharing article analyzes; do not "fix" it here.
    double sum[NUM_THREADS] = { 0.0 };
    const double step = 1.0 / static_cast<double>(n);

    omp_set_num_threads(NUM_THREADS);
    #pragma omp parallel
    {
        const int id = omp_get_thread_num();
        const int num_threads = omp_get_num_threads();

        // get master thread to return how many threads were actually created
        if (id == 0)
        {
            actual_thread_count = num_threads;
        }

        // each thread is responsible for calculating the area of a specific set of sections underneath the curve:
        // thread id handles slices id, id + num_threads, id + 2 * num_threads, ...
        for (int i = id; i < n; i = i + num_threads)
        {
            // midpoint of slice i
            const double x = (static_cast<double>(i) + 0.5) * step;
            sum[id] += 1.0 / (1.0 + x * x);
        }
    }

    // sum up each calculation to get approximation of pi
    // (only the slots of threads that actually ran are read)
    for (int i = 0; i < actual_thread_count; i++)
    {
        pi += 4 * sum[i] * step;
    }

    te = steady_clock::now();

    std::cout << "n = " << n <<
        std::fixed << std::setprecision(15) <<
        "\n pi(exact) = " << 3.141592653589793 <<
        "\n pi(calcd) = " << pi << std::endl;
    reportTime("Integration", te - ts);
}

Solutions to False Sharing

Padding

Synchronization

Other Alternatives

Conclusion

References