62
edits
Changes
→Intel Memory Access Pattern Analysis
= Intel Memory Access Pattern Analysis =
We can use the Memory Access Patterns (MAP) analysis tool to check for various memory issues, such as non-contiguous memory accesses and unit-stride versus non-unit-stride accesses.
<source>#include <iostream>
using namespace std;
// Number of elements in the global array filled by main().
const long SIZE = 3500000L;

// Two-field record. A loop that writes only member1 of consecutive
// elements steps over member2 each time, so its stores are not contiguous.
struct tricky
{
    int member1;
    float member2;
};

// Global array of SIZE records.
tricky structArray[SIZE];
// Writes member1 of every element of structArray (member2 is never touched)
// and prints a marker line before and after the loop.
int main()
{
    std::cout << "Starting.\n";

    for (long int index = 0; index < SIZE; ++index)
    {
        // Computed in long arithmetic, then narrowed to the int field.
        const long int value = (index / 25) + index - 78;
        structArray[index].member1 = static_cast<int>(value);
    }

    std::cout << "Done.\n";
    return EXIT_SUCCESS;
}
</source>
<source>#include <iostream>
#include <time.h>
using namespace std;
// Element count of both floatArray and doubleArray.
const int SIZE = 14992;
// Inner-loop iteration count used by every timed loop: half of SIZE.
const int STEPS = SIZE / 2;
// Number of times each timed inner loop is repeated.
const int LOOPS = 1500000;

// Buffers written by the timed loops.
float floatArray[SIZE] = {};
double doubleArray[SIZE] = {};

// Timestamps taken with time() before and after each timed loop nest.
time_t start = 0;
time_t finish = 0;
// Times three loop nests with time() and prints the elapsed value of each:
//   1. unit-stride float stores into floatArray[0 .. STEPS-1],
//   2. unit-stride double stores into doubleArray[0 .. STEPS-1],
//   3. stride-2 float stores into floatArray[0], [2], ... , [SIZE-2].
// Every nest runs LOOPS outer iterations of STEPS inner iterations, so the
// three measurements differ only in element type and stride.
// Elapsed values are differences of time() results, so their resolution is
// whatever time() provides (whole seconds on typical platforms).
// Returns EXIT_SUCCESS.
int main()
{
// Contiguous data access, same number of iterations as the noncontiguous.
start = time(NULL);
// Compiler-specific pragma asking that the following loop not be unrolled;
// presumably used so the loops are measured in the form written here.
#pragma nounroll
// The counter is a float, so the value stored below already has the
// array's element type.
for (float i = 0; i < LOOPS; i++)
{
#pragma nounroll
for (int j = 0; j < STEPS; j += 1)
{
floatArray[j] = i;
}
}
finish = time(NULL);
cout << "Contiguous Float: " << finish - start << "\n";
// Contiguous data access on doubles, so that it should require roughly
// the same number of cache line loads as the 2-stride float loop.
// (STEPS doubles and SIZE floats span the same number of bytes when
// sizeof(double) == 2 * sizeof(float), since SIZE == 2 * STEPS.)
start = time(NULL);
#pragma nounroll
// Double counter, matching the element type of doubleArray.
for (double i = 0; i < LOOPS; i++)
{
#pragma nounroll
for (int j = 0; j < STEPS; j += 1)
{
doubleArray[j] = i;
}
}
finish = time(NULL);
cout << "Contiguous Double: " << finish - start << "\n";
// Stride-2 float. Same number of iterations as the contiguous version,
// same number of cache line loads as the double loop. Slower than both.
start = time(NULL);
#pragma nounroll
for (float i = 0; i < LOOPS; i++)
{
#pragma nounroll
// j visits every other element; STEPS * 2 equals SIZE, so the last index
// written is SIZE - 2 and the loop body still runs STEPS times.
for (int j = 0; j < STEPS * 2; j += 2)
{
floatArray[j] = i;
}
}
finish = time(NULL);
cout << "Noncontiguous Float: " << finish - start << "\n";
return EXIT_SUCCESS;
}</source>
= Intel Dependencies Analysis =