T* reduced = new T[max_threads];
T* scanRes = new T[max_threads];
#pragma omp parallel { int ntiles = omp_get_num_threads(); // Number of tiles int itile = omp_get_thread_num(); int tile_size = (size - 1) / ntiles + 1; int last_tile = ntiles - 1; int last_tile_size = size - last_tile * tile_size; if (itile == 0) nthreads = ntiles; // step 1 - reduce each tile separately for (int itile = 0; itile < ntiles; itile++) reduced[itile] = reduce(in + itile * tile_size, itile == last_tile ? last_tile_size : tile_size, combine, T(0)); // step 2 - perform exclusive scan on all tiles using reduction outputs // store results in scanRes[] excl_scan(reduced, scanRes, ntiles, combine, T(0)); // step 3 - scan each tile separately using scanRes[] for (int itile = 0; itile < ntiles; itile++) scan_fn(in + itile * tile_size, out + itile * tile_size, itile == last_tile ? last_tile_size : tile_size, combine, scanRes[itile]); } delete[] reduced; delete[] scanRes; } return nthreads;
}