DPS921/Intel Math Kernel Library

=== Code Samples ===
These samples are taken directly from Intel's oneAPI example code on GitHub ([https://github.com/oneapi-src/oneAPI-samples oneAPI-samples]). The specific files we used in our presentation are noted under each heading below.
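Before the full listings, here is a minimal sketch (ours, not part of the Intel samples) of the core pattern both listings build on: create a device queue, allocate unified shared memory, launch a kernel with `parallel_for`, wait for it, and free the memory. The array size and variable names are purely illustrative.

```cpp
#include <CL/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;  // Default selector: picks the most performant available device.
  constexpr size_t n = 8;

  // Unified shared memory: visible to both host and device.
  int *data = sycl::malloc_shared<int>(n, q);

  // One work item per element; each writes its own slot.
  q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) {
     data[i] = static_cast<int>(i[0]) * 2;
   }).wait();

  for (size_t i = 0; i < n; i++) std::cout << data[i] << " ";
  std::cout << "\n";

  sycl::free(data, q);
  return 0;
}
```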
==== Vector Add ====
Source file: DirectProgramming/DPC%2B%2B/DenseLinearAlgebra/vector-add/src/vector-add-usm.cpp in the repository linked above.

```cpp
//==============================================================
// Vector Add is the equivalent of a Hello, World! sample for data parallel
// programs. Building and running the sample verifies that your development
// environment is set up correctly and demonstrates the use of the core
// features of DPC++. This sample runs on both CPU and GPU (or FPGA). When run,
// it computes on both the CPU and offload device, then compares results. If
// the code executes on both CPU and offload device, the device name and a
// success message are displayed.
//
// For comprehensive instructions regarding DPC++ Programming, go to
// https://software.intel.com/en-us/oneapi-programming-guide and search based
// on relevant terms noted in the comments.
//
// DPC++ material used in the code sample:
// • A one-dimensional array of data shared between CPU and offload device.
// • A device queue and kernel.
//==============================================================
// Copyright © Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================
#include <CL/sycl.hpp>
#include <array>
#include <iostream>
#if FPGA || FPGA_EMULATOR
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#endif

using namespace sycl;

// Array size for this example.
constexpr size_t array_size = 10000;

// Create an exception handler for asynchronous SYCL exceptions
static auto exception_handler = [](sycl::exception_list e_list) {
  for (std::exception_ptr const &e : e_list) {
    try {
      std::rethrow_exception(e);
    } catch (std::exception const &e) {
#if _DEBUG
      std::cout << "Failure" << std::endl;
#endif
      std::terminate();
    }
  }
};

//************************************
// Vector add in DPC++ on device: returns sum in 4th parameter "sum".
//************************************
void VectorAdd(queue &q, const int *a, const int *b, int *sum, size_t size) {
  // Create the range object for the arrays.
  range<1> num_items{size};

  // Use parallel_for to run vector addition in parallel on device. This
  // executes the kernel.
  // 1st parameter is the number of work items.
  // 2nd parameter is the kernel, a lambda that specifies what to do per
  // work item. The parameter of the lambda is the work item id.
  // DPC++ supports unnamed lambda kernels by default.
  auto e = q.parallel_for(num_items, [=](auto i) { sum[i] = a[i] + b[i]; });

  // q.parallel_for() is an asynchronous call. DPC++ runtime enqueues and runs
  // the kernel asynchronously. Wait for the asynchronous call to complete.
  e.wait();
}

//************************************
// Initialize the array from 0 to array_size - 1
//************************************
void InitializeArray(int *a, size_t size) {
  for (size_t i = 0; i < size; i++) a[i] = i;
}

//************************************
// Demonstrate vector add both in sequential on CPU and in parallel on device.
//************************************
int main() {
  // Create device selector for the device of your interest.
#if FPGA_EMULATOR
  // DPC++ extension: FPGA emulator selector on systems without FPGA card.
  INTEL::fpga_emulator_selector d_selector;
#elif FPGA
  // DPC++ extension: FPGA selector on systems with FPGA card.
  INTEL::fpga_selector d_selector;
#else
  // The default device selector will select the most performant device.
  default_selector d_selector;
#endif

  try {
    queue q(d_selector, exception_handler);

    // Print out the device information used for the kernel code.
    std::cout << "Running on device: "
              << q.get_device().get_info<info::device::name>() << "\n";
    std::cout << "Vector size: " << array_size << "\n";

    // Create arrays with "array_size" to store input and output data. Allocate
    // unified shared memory so that both CPU and device can access them.
    int *a = malloc_shared<int>(array_size, q);
    int *b = malloc_shared<int>(array_size, q);
    int *sum_sequential = malloc_shared<int>(array_size, q);
    int *sum_parallel = malloc_shared<int>(array_size, q);

    if ((a == nullptr) || (b == nullptr) || (sum_sequential == nullptr) ||
        (sum_parallel == nullptr)) {
      if (a != nullptr) free(a, q);
      if (b != nullptr) free(b, q);
      if (sum_sequential != nullptr) free(sum_sequential, q);
      if (sum_parallel != nullptr) free(sum_parallel, q);

      std::cout << "Shared memory allocation failure.\n";
      return -1;
    }

    // Initialize input arrays with values from 0 to array_size - 1
    InitializeArray(a, array_size);
    InitializeArray(b, array_size);

    // Compute the sum of the two arrays sequentially for validation.
    for (size_t i = 0; i < array_size; i++) sum_sequential[i] = a[i] + b[i];

    // Vector addition in DPC++.
    VectorAdd(q, a, b, sum_parallel, array_size);

    // Verify that the two result arrays are equal.
    for (size_t i = 0; i < array_size; i++) {
      if (sum_parallel[i] != sum_sequential[i]) {
        std::cout << "Vector add failed on device.\n";
        return -1;
      }
    }

    int indices[]{0, 1, 2, (array_size - 1)};
    constexpr size_t indices_size = sizeof(indices) / sizeof(int);

    // Print out the result of vector add.
    for (int i = 0; i < indices_size; i++) {
      int j = indices[i];
      if (i == indices_size - 1) std::cout << "...\n";
      std::cout << "[" << j << "]: " << j << " + " << j << " = "
                << sum_sequential[j] << "\n";
    }

    free(a, q);
    free(b, q);
    free(sum_sequential, q);
    free(sum_parallel, q);
  } catch (exception const &e) {
    std::cout << "An exception is caught while adding two vectors.\n";
    std::terminate();
  }

  std::cout << "Vector add successfully completed on device.\n";
  return 0;
}
```
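The listing above uses unified shared memory (`malloc_shared`). The matrix multiplication sample below uses DPC++'s other memory model, buffers and accessors. As a bridge between the two, here is a short sketch of ours (not part of the Intel samples) that performs the same vector addition with buffers; the sizes and names are illustrative:

```cpp
#include <CL/sycl.hpp>
#include <iostream>
#include <vector>

using namespace sycl;

int main() {
  constexpr size_t n = 1024;
  std::vector<int> a(n, 1), b(n, 2), sum(n, 0);

  queue q;
  {
    // Buffers wrap the host data for the duration of this scope.
    buffer<int> a_buf(a.data(), range<1>(n));
    buffer<int> b_buf(b.data(), range<1>(n));
    buffer<int> sum_buf(sum.data(), range<1>(n));

    q.submit([&](handler &h) {
      // Accessors declare how each buffer is used inside the kernel.
      accessor a_acc(a_buf, h, read_only);
      accessor b_acc(b_buf, h, read_only);
      accessor s_acc(sum_buf, h, write_only);

      h.parallel_for(range<1>{n},
                     [=](id<1> i) { s_acc[i] = a_acc[i] + b_acc[i]; });
    });
  }  // Buffer destruction copies the results back into the host vectors.

  std::cout << "sum[0] = " << sum[0] << "\n";  // expect 3
  return 0;
}
```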
==== Matrix Multiplication ====
Source: the matrix_mul sample in the same repository (DirectProgramming/DPC%2B%2B/DenseLinearAlgebra/matrix_mul/src).

```cpp
//==============================================================
// Copyright © 2020 Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================

/**
 * Matrix_mul multiplies two large matrices on both the CPU and the offload
 * device, then compares results. If the code executes on both CPU and the
 * offload device, the name of the offload device and a success message are
 * displayed.
 *
 * For comprehensive instructions regarding DPC++ Programming, go to
 * https://software.intel.com/en-us/oneapi-programming-guide and search based
 * on relevant terms noted in the comments.
 */

#include <CL/sycl.hpp>
#include <iostream>
#include <limits>

// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"

using namespace std;
using namespace sycl;

/**
 * Each element of the product matrix c[i][j] is computed from a unique row and
 * column of the factor matrices, a[i][k] and b[k][j].
 */

// Matrix size constants.
constexpr int m_size = 150 * 8;  // Must be a multiple of 8.
constexpr int M = m_size / 8;
constexpr int N = m_size / 4;
constexpr int P = m_size / 2;

/**
 * Perform matrix multiplication on host to verify results from device.
 */
int VerifyResult(float (*c_back)[P]);

int main() {
  // Host memory buffer that device will write data back to before destruction.
  float(*c_back)[P] = new float[M][P];

  // Initialize c_back
  for (int i = 0; i < M; i++)
    for (int j = 0; j < P; j++) c_back[i][j] = 0.0f;

  // Initialize the device queue with the default selector. The device queue is
  // used to enqueue kernels. It encapsulates all states needed for execution.
  try {
    queue q(default_selector{}, dpc_common::exception_handler);

    cout << "Device: " << q.get_device().get_info<info::device::name>()
         << "\n";

    // Create 2D buffers for matrices; buffer c is bound with host memory c_back
    buffer<float, 2> a_buf(range(M, N));
    buffer<float, 2> b_buf(range(N, P));
    buffer c_buf(reinterpret_cast<float *>(c_back), range(M, P));

    cout << "Problem size: c(" << M << "," << P << ") = a(" << M << "," << N
         << ") * b(" << N << "," << P << ")\n";

    // Using three command groups to illustrate execution order. The use of the
    // first two command groups for initializing matrices is not the most
    // efficient way. It just demonstrates the implicit multiple command group
    // execution ordering.

    // Submit command group to queue to initialize matrix a
    q.submit([&](auto &h) {
      // Get write only access to the buffer on a device
      accessor a(a_buf, h, write_only);

      // Execute kernel.
      h.parallel_for(range(M, N), [=](auto index) {
        // Each element of matrix a is 1.
        a[index] = 1.0f;
      });
    });

    // Submit command group to queue to initialize matrix b
    q.submit([&](auto &h) {
      // Get write only access to the buffer on a device
      accessor b(b_buf, h, write_only);

      // Execute kernel.
      h.parallel_for(range(N, P), [=](auto index) {
        // Each column of b is the sequence 1,2,...,N
        b[index] = index[0] + 1.0f;
      });
    });

    // Submit command group to queue to multiply matrices: c = a * b
    q.submit([&](auto &h) {
      // Read from a and b, write to c
      accessor a(a_buf, h, read_only);
      accessor b(b_buf, h, read_only);
      accessor c(c_buf, h, write_only);

      int width_a = a_buf.get_range()[1];

      // Execute kernel.
      h.parallel_for(range(M, P), [=](auto index) {
        // Get global position in Y direction.
        int row = index[0];
        // Get global position in X direction.
        int col = index[1];

        float sum = 0.0f;

        // Compute the result of one element of c
        for (int i = 0; i < width_a; i++) {
          sum += a[row][i] * b[i][col];
        }

        c[index] = sum;
      });
    });
  } catch (sycl::exception const &e) {
    cout << "An exception is caught while multiplying matrices.\n";
    terminate();
  }

  int result;
  cout << "Result of matrix multiplication using DPC++: ";
  result = VerifyResult(c_back);
  delete[] c_back;

  return result;
}

bool ValueSame(float a, float b) {
  return fabs(a - b) < numeric_limits<float>::epsilon();
}

int VerifyResult(float (*c_back)[P]) {
  // Check that the results are correct by comparing with host computing.
  int i, j, k;

  // 2D arrays on host side.
  float(*a_host)[N] = new float[M][N];
  float(*b_host)[P] = new float[N][P];
  float(*c_host)[P] = new float[M][P];

  // Each element of matrix a is 1.
  for (i = 0; i < M; i++)
    for (j = 0; j < N; j++) a_host[i][j] = 1.0f;

  // Each column of b_host is the sequence 1,2,...,N
  for (i = 0; i < N; i++)
    for (j = 0; j < P; j++) b_host[i][j] = i + 1.0f;

  // c_host is initialized to zero.
  for (i = 0; i < M; i++)
    for (j = 0; j < P; j++) c_host[i][j] = 0.0f;

  for (i = 0; i < M; i++) {
    for (k = 0; k < N; k++) {
      // Each element of the product is just the sum 1+2+...+N
      for (j = 0; j < P; j++) {
        c_host[i][j] += a_host[i][k] * b_host[k][j];
      }
    }
  }

  bool mismatch_found = false;

  // Compare host side results with the result buffer from device side: print
  // mismatched data 5 times only.
  int print_count = 0;

  for (i = 0; i < M; i++) {
    for (j = 0; j < P; j++) {
      if (!ValueSame(c_back[i][j], c_host[i][j])) {
        cout << "Fail - The result is incorrect for element: [" << i << ", "
             << j << "], expected: " << c_host[i][j]
             << ", but found: " << c_back[i][j] << "\n";
        mismatch_found = true;
        print_count++;
        if (print_count == 5) break;
      }
    }

    if (print_count == 5) break;
  }

  delete[] a_host;
  delete[] b_host;
  delete[] c_host;

  if (!mismatch_found) {
    cout << "Success - The results are correct!\n";
    return 0;
  } else {
    cout << "Fail - The results mismatch!\n";
    return -1;
  }
}
```
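Since this page is about the Intel Math Kernel Library, it is worth noting that the same product can be computed with a single oneMKL GEMM call instead of a hand-written kernel. The following sketch is ours, not part of the Intel samples; it assumes the classic CBLAS interface from `mkl.h` and that the program is linked against oneMKL:

```cpp
#include <mkl.h>

#include <iostream>
#include <vector>

int main() {
  // Same shapes as the sample above: c(M,P) = a(M,N) * b(N,P).
  const int M = 150, N = 300, P = 600;
  std::vector<float> a(M * N, 1.0f);          // every element of a is 1
  std::vector<float> b(N * P), c(M * P, 0.0f);

  // Each column of b is the sequence 1,2,...,N (row-major layout).
  for (int i = 0; i < N; i++)
    for (int j = 0; j < P; j++) b[i * P + j] = i + 1.0f;

  // C = 1.0 * A * B + 0.0 * C, all matrices stored row-major.
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, P, N, 1.0f,
              a.data(), N, b.data(), P, 0.0f, c.data(), P);

  // Every element should equal 1 + 2 + ... + N = N*(N+1)/2.
  std::cout << "c[0][0] = " << c[0] << " (expected " << N * (N + 1) / 2
            << ")\n";
  return 0;
}
```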