HCC
HCC is a single-source C/C++ compiler for heterogeneous computing, optimized for HSA (http://www.hsafoundation.com/).
hc.hpp
1 //===----------------------------------------------------------------------===//
2 //
3 // This file is distributed under the University of Illinois Open Source
4 // License. See LICENSE.TXT for details.
5 //
6 //===----------------------------------------------------------------------===//
7 
13 #pragma once
14 
15 #include "hc_defines.h"
16 #include "kalmar_exception.h"
17 #include "kalmar_index.h"
18 #include "kalmar_runtime.h"
19 #include "kalmar_serialize.h"
20 #include "kalmar_launch.h"
21 #include "kalmar_buffer.h"
22 #include "kalmar_math.h"
23 
24 #include "hsa_atomic.h"
25 #include "kalmar_cpu_launch.h"
26 #include "hcc_features.hpp"
27 
28 #ifndef __HC__
29 # define __HC__ [[hc]]
30 #endif
31 
32 #ifndef __CPU__
33 # define __CPU__ [[cpu]]
34 #endif
35 
36 typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
37 
42 namespace Kalmar {
43  class HSAQueue;
44 };
45 
46 namespace hc {
47 
48 class AmPointerInfo;
49 
50 using namespace Kalmar::enums;
51 using namespace Kalmar::CLAMP;
52 
53 
54 // forward declaration
55 class accelerator;
56 class accelerator_view;
57 class completion_future;
58 template <int N> class extent;
59 template <int N> class tiled_extent;
60 template <typename T, int N> class array_view;
61 template <typename T, int N> class array;
62 
63 
64 
65 // namespace alias
66 // namespace hc::fast_math is an alias of namespace Kalmar::fast_math
67 namespace fast_math = Kalmar::fast_math;
68 
69 // namespace hc::precise_math is an alias of namespace Kalmar::precise_math
70 namespace precise_math = Kalmar::precise_math;
71 
72 // type alias
73 
77 template <int N>
78 using index = Kalmar::index<N>;
79 
83 
84 // ------------------------------------------------------------------------
85 // global functions
86 // ------------------------------------------------------------------------
87 
93 inline uint64_t get_system_ticks() {
94  return Kalmar::getContext()->getSystemTicks();
95 }
96 
103 inline uint64_t get_tick_frequency() {
104  return Kalmar::getContext()->getSystemTickFrequency();
105 }
106 
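The two tick functions above pair naturally: divide a tick delta by the tick frequency to get seconds. A minimal host-side sketch (not part of hc.hpp; do_work() is a hypothetical placeholder):

    #include <hc.hpp>
    #include <iostream>

    void report_elapsed() {
      uint64_t t0 = hc::get_system_ticks();
      // do_work();   // hypothetical workload being timed
      uint64_t t1 = hc::get_system_ticks();
      double seconds = double(t1 - t0) / double(hc::get_tick_frequency());
      std::cout << "elapsed: " << seconds << " s\n";
    }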
107 #define GET_SYMBOL_ADDRESS(acc, symbol) \
108  acc.get_symbol_address( #symbol );
109 
110 
111 // ------------------------------------------------------------------------
112 // accelerator_view
113 // ------------------------------------------------------------------------
114 
120 class accelerator_view {
121 public:
129  accelerator_view(const accelerator_view& other) :
130  pQueue(other.pQueue) {}
131 
141  accelerator_view& operator=(const accelerator_view& other) {
142  pQueue = other.pQueue;
143  return *this;
144  }
145 
152  queuing_mode get_queuing_mode() const { return pQueue->get_mode(); }
153 
157  execute_order get_execute_order() const { return pQueue->get_execute_order(); }
158 
170  // FIXME: dummy implementation now
171  bool get_is_auto_selection() { return false; }
172 
182  unsigned int get_version() const;
183 
187  accelerator get_accelerator() const;
188 
196  // FIXME: dummy implementation now
197  bool get_is_debug() const { return 0; }
198 
208  void wait(hcWaitMode waitMode = hcWaitModeBlocked) {
209  pQueue->wait(waitMode);
210  Kalmar::getContext()->flushPrintfBuffer();
211  }
212 
241  void flush() { pQueue->flush(); }
242 
262  completion_future create_marker(memory_scope fence_scope=system_scope) const;
263 
289  completion_future create_blocking_marker(completion_future& dependent_future, memory_scope fence_scope=system_scope) const;
290 
313  completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope fence_scope=system_scope) const;
314 
315 
333  template<typename InputIterator>
334  completion_future create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const;
335 
343  void copy(const void *src, void *dst, size_t size_bytes) {
344  pQueue->copy(src, dst, size_bytes);
345  }
346 
347 
360  void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc, bool forceUnpinnedCopy);
361 
362 
363  // TODO - this form is deprecated, provided for use with older HIP runtimes.
364  void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceUnpinnedCopy) ;
365 
380  completion_future copy_async(const void *src, void *dst, size_t size_bytes);
381 
382 
407  completion_future copy_async_ext(const void *src, void *dst, size_t size_bytes,
408  hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
409  const hc::accelerator *copyAcc);
410 
419  bool operator==(const accelerator_view& other) const {
420  return pQueue == other.pQueue;
421  }
422 
431  bool operator!=(const accelerator_view& other) const { return !(*this == other); }
432 
437  int get_max_tile_static_size() {
438  return pQueue.get()->getDev()->GetMaxTileStaticSize();
439  }
440 
447  int get_pending_async_ops() {
448  return pQueue->getPendingAsyncOps();
449  }
450 
458  bool get_is_empty() {
459  return pQueue->isEmpty();
460  }
461 
468  void* get_hsa_queue() {
469  return pQueue->getHSAQueue();
470  }
471 
478  void* get_hsa_agent() {
479  return pQueue->getHSAAgent();
480  }
481 
490  void* get_hsa_am_region() {
491  return pQueue->getHSAAMRegion();
492  }
493 
494 
503  void* get_hsa_am_system_region() {
504  return pQueue->getHSAAMHostRegion();
505  }
506 
515  void* get_hsa_am_finegrained_system_region() {
516  return pQueue->getHSACoherentAMHostRegion();
517  }
518 
526  void* get_hsa_kernarg_region() {
527  return pQueue->getHSAKernargRegion();
528  }
529 
533  bool is_hsa_accelerator() {
534  return pQueue->hasHSAInterOp();
535  }
536 
597  void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
598  const void * args, size_t argsize,
599  hc::completion_future *cf=nullptr, const char *kernel_name = nullptr)
600  {
601  pQueue->dispatch_hsa_kernel(aql, args, argsize, cf, kernel_name);
602  }
603 
618  bool set_cu_mask(const std::vector<bool>& cu_mask) {
619  // If this is an HSA-based accelerator_view, set the CU mask; otherwise return false.
620  if(is_hsa_accelerator()) {
621  return pQueue->set_cu_mask(cu_mask);
622  }
623  return false;
624  }
625 
626 private:
627  accelerator_view(std::shared_ptr<Kalmar::KalmarQueue> pQueue) : pQueue(pQueue) {}
628  std::shared_ptr<Kalmar::KalmarQueue> pQueue;
629 
630  friend class accelerator;
631  template <typename Q, int K> friend class array;
632  template <typename Q, int K> friend class array_view;
633 
634  template<typename Kernel> friend
635  void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, const Kernel&);
636  template<typename Kernel, int dim_ext> friend
637  void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
638  template<typename Kernel, int dim_ext> friend
639  std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
640  template<typename Kernel, int dim_ext> friend
641  void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
642  template<typename Kernel, int dim_ext> friend
643  std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
644 
645 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
646  template <typename Kernel, int N> friend
647  completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>&, Kernel const&, extent<N> const&);
648 #endif
649 
650  // non-tiled parallel_for_each
651  // generic version
652  template <int N, typename Kernel> friend
653  completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
654 
655  // 1D specialization
656  template <typename Kernel> friend
657  completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
658 
659  // 2D specialization
660  template <typename Kernel> friend
661  completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
662 
663  // 3D specialization
664  template <typename Kernel> friend
665  completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
666 
667  // tiled parallel_for_each, 3D version
668  template <typename Kernel> friend
669  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
670 
671  // tiled parallel_for_each, 2D version
672  template <typename Kernel> friend
673  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
674 
675  // tiled parallel_for_each, 1D version
676  template <typename Kernel> friend
677  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
678 
679 
680 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
681 public:
682 #endif
683  __attribute__((annotate("user_deserialize")))
684  accelerator_view() __CPU__ __HC__ {
685 #if __KALMAR_ACCELERATOR__ != 1
686  throw runtime_exception("errorMsg_throw", 0);
687 #endif
688  }
689 };
690 
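A minimal usage sketch for the class above (not part of hc.hpp): obtain the default view of the default accelerator, submit work on it, then fence that work with a marker or a blocking wait. It assumes a working default accelerator.

    #include <hc.hpp>

    void view_demo() {
      hc::accelerator acc;                               // default accelerator
      hc::accelerator_view av = acc.get_default_view();  // its default queue
      // ... enqueue kernels / copies on av ...
      hc::completion_future marker = av.create_marker(); // fences prior work
      marker.wait();                                     // or simply av.wait();
    }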
691 // ------------------------------------------------------------------------
692 // accelerator
693 // ------------------------------------------------------------------------
694 
700 class accelerator
701 {
702 public:
713  accelerator() : accelerator(L"default") {}
714 
730  explicit accelerator(const std::wstring& path)
731  : pDev(Kalmar::getContext()->getDevice(path)) {}
732 
740  accelerator(const accelerator& other) : pDev(other.pDev) {}
741 
749  static std::vector<accelerator> get_all() {
750  auto Devices = Kalmar::getContext()->getDevices();
751  std::vector<accelerator> ret;
752  for(auto&& i : Devices)
753  ret.push_back(i);
754  return ret;
755  }
756 
770  static bool set_default(const std::wstring& path) {
771  return Kalmar::getContext()->set_default(path);
772  }
773 
789  static accelerator_view get_auto_selection_view() {
790  return Kalmar::getContext()->auto_select();
791  }
792 
802  accelerator& operator=(const accelerator& other) {
803  pDev = other.pDev;
804  return *this;
805  }
806 
813  accelerator_view get_default_view() const { return pDev->get_default_queue(); }
814 
823  accelerator_view create_view(execute_order order = execute_in_order, queuing_mode mode = queuing_mode_automatic) {
824  auto pQueue = pDev->createQueue(order);
825  pQueue->set_mode(mode);
826  return pQueue;
827  }
828 
837  bool operator==(const accelerator& other) const { return pDev == other.pDev; }
838 
847  bool operator!=(const accelerator& other) const { return !(*this == other); }
848 
867  bool set_default_cpu_access_type(access_type type) {
868  pDev->set_access(type);
869  return true;
870  }
871 
877  std::wstring get_device_path() const { return pDev->get_path(); }
878 
882  std::wstring get_description() const { return pDev->get_description(); }
883 
890  unsigned int get_version() const { return pDev->get_version(); }
891 
899  // FIXME: dummy implementation now
900  bool get_has_display() const { return false; }
901 
907  size_t get_dedicated_memory() const { return pDev->get_mem(); }
908 
914  bool get_supports_double_precision() const { return pDev->is_double(); }
915 
922  bool get_supports_limited_double_precision() const { return pDev->is_lim_double(); }
923 
928  // FIXME: dummy implementation now
929  bool get_is_debug() const { return false; }
930 
935  bool get_is_emulated() const { return pDev->is_emulated(); }
936 
941  bool get_supports_cpu_shared_memory() const { return pDev->is_unified(); }
942 
946  access_type get_default_cpu_access_type() const { return pDev->get_access(); }
947 
948 
953  int get_max_tile_static_size() const {
954  return get_default_view().get_max_tile_static_size();
955  }
956 
960  std::vector<accelerator_view> get_all_views() {
961  std::vector<accelerator_view> result;
962  std::vector< std::shared_ptr<Kalmar::KalmarQueue> > queues = pDev->get_all_queues();
963  for (auto q : queues) {
964  result.push_back(q);
965  }
966  return result;
967  }
968 
977  void* get_hsa_am_region() const {
978  return get_default_view().get_hsa_am_region();
979  }
980 
989  void* get_hsa_am_system_region() const {
990  return get_default_view().get_hsa_am_system_region();
991  }
992 
1001  void* get_hsa_am_finegrained_system_region() const {
1002  return get_default_view().get_hsa_am_finegrained_system_region();
1003  }
1004 
1012  void* get_hsa_kernarg_region() const {
1013  return get_default_view().get_hsa_kernarg_region();
1014  }
1015 
1019  bool is_hsa_accelerator() const {
1020  return get_default_view().is_hsa_accelerator();
1021  }
1022 
1029  hcAgentProfile get_profile() const {
1030  return pDev->getProfile();
1031  }
1032 
1033  void memcpy_symbol(const char* symbolName, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
1034  pDev->memcpySymbol(symbolName, hostptr, count, offset, kind);
1035  }
1036 
1037  void memcpy_symbol(void* symbolAddr, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
1038  pDev->memcpySymbol(symbolAddr, hostptr, count, offset, kind);
1039  }
1040 
1041  void* get_symbol_address(const char* symbolName) {
1042  return pDev->getSymbolAddress(symbolName);
1043  }
1044 
1051  void* get_hsa_agent() const {
1052  return pDev->getHSAAgent();
1053  }
1054 
1061  bool get_is_peer(const accelerator& other) const {
1062  return pDev->is_peer(other.pDev);
1063  }
1064 
1070  std::vector<accelerator> get_peers() const {
1071  std::vector<accelerator> peers;
1072 
1073  const auto &accs = get_all();
1074 
1075  for(auto iter = accs.begin(); iter != accs.end(); iter++)
1076  {
1077  if(this->get_is_peer(*iter))
1078  peers.push_back(*iter);
1079  }
1080  return peers;
1081  }
1082 
1087  unsigned int get_cu_count() const {
1088  return pDev->get_compute_unit_count();
1089  }
1090 
1095  int get_seqnum() const {
1096  return pDev->get_seqnum();
1097  }
1098 
1099 
1106  bool has_cpu_accessible_am() const {
1107  return pDev->has_cpu_accessible_am();
1108  };
1109 
1110  Kalmar::KalmarDevice *get_dev_ptr() const { return pDev; };
1111 
1112 private:
1113  accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {}
1114  friend class accelerator_view;
1115  Kalmar::KalmarDevice* pDev;
1116 };
1117 
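A short sketch of the accelerator API above (not part of hc.hpp): enumerate devices, print their identification strings, and create an extra execution queue on the first one.

    #include <hc.hpp>
    #include <iostream>
    #include <vector>

    void list_accelerators() {
      std::vector<hc::accelerator> accs = hc::accelerator::get_all();
      for (const auto& a : accs)
        std::wcout << a.get_device_path() << L": " << a.get_description() << L"\n";

      if (!accs.empty()) {
        hc::accelerator_view av = accs[0].create_view();  // execute_in_order by default
        (void)av;
      }
    }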
1118 // ------------------------------------------------------------------------
1119 // completion_future
1120 // ------------------------------------------------------------------------
1121 
1130 class completion_future {
1131 public:
1132 
1138  completion_future() : __amp_future(), __thread_then(nullptr), __asyncOp(nullptr) {};
1139 
1147  completion_future(const completion_future& other)
1148  : __amp_future(other.__amp_future), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}
1149 
1159  completion_future(completion_future&& other)
1160  : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}
1161 
1170  completion_future& operator=(const completion_future& _Other) {
1171  if (this != &_Other) {
1172  __amp_future = _Other.__amp_future;
1173  __thread_then = _Other.__thread_then;
1174  __asyncOp = _Other.__asyncOp;
1175  }
1176  return (*this);
1177  }
1178 
1188  completion_future& operator=(completion_future&& _Other) {
1189  if (this != &_Other) {
1190  __amp_future = std::move(_Other.__amp_future);
1191  __thread_then = _Other.__thread_then;
1192  __asyncOp = _Other.__asyncOp;
1193  }
1194  return (*this);
1195  }
1196 
1204  void get() const {
1205  __amp_future.get();
1206  }
1207 
1213  bool valid() const {
1214  return __amp_future.valid();
1215  }
1216 
1235  void wait(hcWaitMode mode = hcWaitModeBlocked) const {
1236  if (this->valid()) {
1237  if (__asyncOp != nullptr) {
1238  __asyncOp->setWaitMode(mode);
1239  }
1240  //TODO-ASYNC - need to reclaim older AsyncOps here.
1241  __amp_future.wait();
1242  }
1243 
1244  Kalmar::getContext()->flushPrintfBuffer();
1245  }
1246 
1247  template <class _Rep, class _Period>
1248  std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const {
1249  return __amp_future.wait_for(_Rel_time);
1250  }
1251 
1252  template <class _Clock, class _Duration>
1253  std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const {
1254  return __amp_future.wait_until(_Abs_time);
1255  }
1256 
1264  operator std::shared_future<void>() const {
1265  return __amp_future;
1266  }
1267 
1274  // FIXME: notice we removed const from the signature here
1275  // the original signature in the specification should be
1276  // template<typename functor>
1277  // void then(const functor& func) const;
1278  template<typename functor>
1279  void then(const functor & func) {
1280 #if __KALMAR_ACCELERATOR__ != 1
1281  // could only assign once
1282  if (__thread_then == nullptr) {
1283  // spawn a new thread to wait on the future and then execute the callback functor
1284  __thread_then = new std::thread([&]() __CPU__ {
1285  this->wait();
1286  if(this->valid())
1287  func();
1288  });
1289  }
1290 #endif
1291  }
1292 
1302  void* get_native_handle() const {
1303  if (__asyncOp != nullptr) {
1304  return __asyncOp->getNativeHandle();
1305  } else {
1306  return nullptr;
1307  }
1308  }
1309 
1316  uint64_t get_begin_tick() {
1317  if (__asyncOp != nullptr) {
1318  return __asyncOp->getBeginTimestamp();
1319  } else {
1320  return 0L;
1321  }
1322  }
1323 
1330  uint64_t get_end_tick() {
1331  if (__asyncOp != nullptr) {
1332  return __asyncOp->getEndTimestamp();
1333  } else {
1334  return 0L;
1335  }
1336  }
1337 
1344  uint64_t get_tick_frequency() {
1345  if (__asyncOp != nullptr) {
1346  return __asyncOp->getTimestampFrequency();
1347  } else {
1348  return 0L;
1349  }
1350  }
1351 
1357  bool is_ready() {
1358  if (__asyncOp != nullptr) {
1359  return __asyncOp->isReady();
1360  } else {
1361  return false;
1362  }
1363  }
1364 
1365  ~completion_future() {
1366  if (__thread_then != nullptr) {
1367  __thread_then->join();
1368  }
1369  delete __thread_then;
1370  __thread_then = nullptr;
1371 
1372  if (__asyncOp != nullptr) {
1373  __asyncOp = nullptr;
1374  }
1375  }
1376 
1377 
1381  int get_use_count() const { return __asyncOp.use_count(); };
1382 
1383 private:
1384  std::shared_future<void> __amp_future;
1385  std::thread* __thread_then = nullptr;
1386  std::shared_ptr<Kalmar::KalmarAsyncOp> __asyncOp;
1387 
1388  completion_future(std::shared_ptr<Kalmar::KalmarAsyncOp> event) : __amp_future(*(event->getFuture())), __asyncOp(event) {}
1389 
1390  completion_future(const std::shared_future<void> &__future)
1391  : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {}
1392 
1393  friend class Kalmar::HSAQueue;
1394 
1395  // non-tiled parallel_for_each
1396  // generic version
1397  template <int N, typename Kernel> friend
1398  completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
1399 
1400  // 1D specialization
1401  template <typename Kernel> friend
1402  completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
1403 
1404  // 2D specialization
1405  template <typename Kernel> friend
1406  completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
1407 
1408  // 3D specialization
1409  template <typename Kernel> friend
1410  completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
1411 
1412  // tiled parallel_for_each, 3D version
1413  template <typename Kernel> friend
1414  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
1415 
1416  // tiled parallel_for_each, 2D version
1417  template <typename Kernel> friend
1418  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
1419 
1420  // tiled parallel_for_each, 1D version
1421  template <typename Kernel> friend
1422  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
1423 
1424  // copy_async
1425  template <typename T, int N> friend
1426  completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);
1427  template <typename T, int N> friend
1428  completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
1429  template <typename T, int N> friend
1430  completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
1431  template <typename T, int N> friend
1432  completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest);
1433  template <typename T, int N> friend
1434  completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);
1435 
1436  template <typename InputIter, typename T, int N> friend
1437  completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
1438  template <typename InputIter, typename T, int N> friend
1439  completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
1440  template <typename InputIter, typename T, int N> friend
1441  completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
1442  template <typename InputIter, typename T, int N> friend
1443  completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);
1444  template <typename OutputIter, typename T, int N> friend
1445  completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
1446  template <typename OutputIter, typename T, int N> friend
1447  completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);
1448 
1449  // array_view
1450  template <typename T, int N> friend class array_view;
1451 
1452  // accelerator_view
1453  friend class accelerator_view;
1454 };
1455 
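A sketch of how completion_future is typically consumed (not part of hc.hpp): an asynchronous copy returns a future that can be waited on or chained with then(). The plain host vectors and sizes here are purely illustrative; production code usually stages through pinned or device memory.

    #include <hc.hpp>
    #include <vector>

    void async_copy_demo() {
      std::vector<int> src(1024, 1), dst(1024, 0);
      hc::accelerator_view av = hc::accelerator().get_default_view();

      hc::completion_future cf =
          av.copy_async(src.data(), dst.data(), src.size() * sizeof(int));

      // ... overlap independent host work here ...
      cf.wait();   // block until the copy has finished
    }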
1456 // ------------------------------------------------------------------------
1457 // member function implementations
1458 // ------------------------------------------------------------------------
1459 
1460 inline accelerator
1461 accelerator_view::get_accelerator() const { return pQueue->getDev(); }
1462 
1463 inline completion_future
1464 accelerator_view::create_marker(memory_scope scope) const {
1465  std::shared_ptr<Kalmar::KalmarAsyncOp> deps[1];
1466  // If necessary create an explicit dependency on previous command
1467  // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete.
1468  std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);
1469 
1470  int cnt = 0;
1471  if (depOp) {
1472  deps[cnt++] = depOp; // retrieve async op associated with completion_future
1473  }
1474 
1475  return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
1476 }
1477 
1478 inline unsigned int accelerator_view::get_version() const { return get_accelerator().get_version(); }
1479 
1480 inline completion_future accelerator_view::create_blocking_marker(completion_future& dependent_future, memory_scope scope) const {
1481  std::shared_ptr<Kalmar::KalmarAsyncOp> deps[2];
1482 
1483  // If necessary create an explicit dependency on previous command
1484  // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete.
1485  std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);
1486 
1487  int cnt = 0;
1488  if (depOp) {
1489  deps[cnt++] = depOp; // retrieve async op associated with completion_future
1490  }
1491 
1492  if (dependent_future.__asyncOp) {
1493  deps[cnt++] = dependent_future.__asyncOp; // retrieve async op associated with completion_future
1494  }
1495 
1496  return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
1497 }
1498 
1499 template<typename InputIterator>
1500 inline completion_future
1501 accelerator_view::create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const {
1502  std::shared_ptr<Kalmar::KalmarAsyncOp> deps[5]; // array of 5 pointers to the native handle of async ops. 5 is the max supported by barrier packet
1503  hc::completion_future lastMarker;
1504 
1505 
1506  // If necessary create an explicit dependency on previous command
1507  // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete.
1508  std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);
1509 
1510  int cnt = 0;
1511  if (depOp) {
1512  deps[cnt++] = depOp; // retrieve async op associated with completion_future
1513  }
1514 
1515 
1516  // loop through signals and group into sections of 5
1517  // every 5 signals goes into one barrier packet
1518  // since HC sets the barrier bit in each AND barrier packet, we know
1519  // the barriers will execute in-order
1520  for (auto iter = first; iter != last; ++iter) {
1521  if (iter->__asyncOp) {
1522  deps[cnt++] = iter->__asyncOp; // retrieve async op associated with completion_future
1523  if (cnt == 5) {
1524  lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, hc::no_scope));
1525  cnt = 0;
1526  }
1527  }
1528  }
1529 
1530  if (cnt) {
1531  lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
1532  }
1533 
1534  return lastMarker;
1535 }
1536 
1537 inline completion_future
1538 accelerator_view::create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope scope) const {
1539  return create_blocking_marker(dependent_future_list.begin(), dependent_future_list.end(), scope);
1540 }
1541 
1542 
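A sketch of create_blocking_marker() in action (not part of hc.hpp): a marker enqueued on one view that completes only after a future produced on another view has completed, giving a cross-queue dependency.

    #include <hc.hpp>
    #include <vector>

    void cross_queue_dependency() {
      hc::accelerator acc;
      hc::accelerator_view av0 = acc.create_view();
      hc::accelerator_view av1 = acc.create_view();

      std::vector<char> a(256), b(256);
      hc::completion_future copy_done = av0.copy_async(a.data(), b.data(), a.size());

      // Marker on av1 that waits for the copy submitted on av0.
      hc::completion_future gate = av1.create_blocking_marker(copy_done);
      gate.wait();
    }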
1543 inline void accelerator_view::copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc, bool forceUnpinnedCopy) {
1544  pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo, copyAcc ? copyAcc->pDev : nullptr, forceUnpinnedCopy);
1545 };
1546 
1547 inline void accelerator_view::copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceHostCopyEngine) {
1548  pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo, forceHostCopyEngine);
1549 };
1550 
1551 inline completion_future
1552 accelerator_view::copy_async(const void *src, void *dst, size_t size_bytes) {
1553  return completion_future(pQueue->EnqueueAsyncCopy(src, dst, size_bytes));
1554 }
1555 
1556 inline completion_future
1557 accelerator_view::copy_async_ext(const void *src, void *dst, size_t size_bytes,
1558  hcCommandKind copyDir,
1559  const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
1560  const hc::accelerator *copyAcc)
1561 {
1562  return completion_future(pQueue->EnqueueAsyncCopyExt(src, dst, size_bytes, copyDir, srcInfo, dstInfo, copyAcc ? copyAcc->pDev : nullptr));
1563 };
1564 
1565 
1566 // ------------------------------------------------------------------------
1567 // extent
1568 // ------------------------------------------------------------------------
1569 
1577 template <int N>
1578 class extent {
1579 public:
1583  static const int rank = N;
1584 
1588  typedef int value_type;
1589 
1594  extent() __CPU__ __HC__ : base_() {
1595  static_assert(N > 0, "Dimensionality must be positive");
1596  };
1597 
1604  extent(const extent& other) __CPU__ __HC__
1605  : base_(other.base_) {}
1606 
1616  explicit extent(int e0) __CPU__ __HC__
1617  : base_(e0) {}
1618 
1619  template <typename ..._Tp>
1620  explicit extent(_Tp ... __t) __CPU__ __HC__
1621  : base_(__t...) {
1622  static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor");
1623  static_assert(sizeof...(__t) == N, "rank should be consistency");
1624  }
1625 
1636  explicit extent(const int components[]) __CPU__ __HC__
1637  : base_(components) {}
1638 
1647  explicit extent(int components[]) __CPU__ __HC__
1648  : base_(components) {}
1649 
1657  extent& operator=(const extent& other) __CPU__ __HC__ {
1658  base_.operator=(other.base_);
1659  return *this;
1660  }
1661 
1669  int operator[] (unsigned int c) const __CPU__ __HC__ {
1670  return base_[c];
1671  }
1672  int& operator[] (unsigned int c) __CPU__ __HC__ {
1673  return base_[c];
1674  }
1675 
1686  bool contains(const index<N>& idx) const __CPU__ __HC__ {
1687  return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
1688  }
1689 
1695  unsigned int size() const __CPU__ __HC__ {
1696  return Kalmar::index_helper<N, extent<N>>::count_size(*this);
1697  }
1698 
1711  tiled_extent<1> tile(int t0) const;
1712  tiled_extent<2> tile(int t0, int t1) const;
1713  tiled_extent<3> tile(int t0, int t1, int t2) const;
1714 
1722  tiled_extent<1> tile_with_dynamic(int t0, int dynamic_size) const;
1723  tiled_extent<2> tile_with_dynamic(int t0, int t1, int dynamic_size) const;
1724  tiled_extent<3> tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const;
1725 
1738  bool operator==(const extent& other) const __CPU__ __HC__ {
1739  return Kalmar::index_helper<N, extent<N> >::equal(*this, other);
1740  }
1741  bool operator!=(const extent& other) const __CPU__ __HC__ {
1742  return !(*this == other);
1743  }
1744 
1755  extent& operator+=(const extent& __r) __CPU__ __HC__ {
1756  base_.operator+=(__r.base_);
1757  return *this;
1758  }
1759  extent& operator-=(const extent& __r) __CPU__ __HC__ {
1760  base_.operator-=(__r.base_);
1761  return *this;
1762  }
1763  extent& operator*=(const extent& __r) __CPU__ __HC__ {
1764  base_.operator*=(__r.base_);
1765  return *this;
1766  }
1767  extent& operator/=(const extent& __r) __CPU__ __HC__ {
1768  base_.operator/=(__r.base_);
1769  return *this;
1770  }
1771  extent& operator%=(const extent& __r) __CPU__ __HC__ {
1772  base_.operator%=(__r.base_);
1773  return *this;
1774  }
1775 
1786  extent operator+(const index<N>& idx) __CPU__ __HC__ {
1787  extent __r = *this;
1788  __r += idx;
1789  return __r;
1790  }
1791  extent operator-(const index<N>& idx) __CPU__ __HC__ {
1792  extent __r = *this;
1793  __r -= idx;
1794  return __r;
1795  }
1796  extent& operator+=(const index<N>& idx) __CPU__ __HC__ {
1797  base_.operator+=(idx.base_);
1798  return *this;
1799  }
1800  extent& operator-=(const index<N>& idx) __CPU__ __HC__ {
1801  base_.operator-=(idx.base_);
1802  return *this;
1803  }
1804 
1816  extent& operator+=(int value) __CPU__ __HC__ {
1817  base_.operator+=(value);
1818  return *this;
1819  }
1820  extent& operator-=(int value) __CPU__ __HC__ {
1821  base_.operator-=(value);
1822  return *this;
1823  }
1824  extent& operator*=(int value) __CPU__ __HC__ {
1825  base_.operator*=(value);
1826  return *this;
1827  }
1828  extent& operator/=(int value) __CPU__ __HC__ {
1829  base_.operator/=(value);
1830  return *this;
1831  }
1832  extent& operator%=(int value) __CPU__ __HC__ {
1833  base_.operator%=(value);
1834  return *this;
1835  }
1836 
1847  extent& operator++() __CPU__ __HC__ {
1848  base_.operator+=(1);
1849  return *this;
1850  }
1851  extent operator++(int) __CPU__ __HC__ {
1852  extent ret = *this;
1853  base_.operator+=(1);
1854  return ret;
1855  }
1856  extent& operator--() __CPU__ __HC__ {
1857  base_.operator-=(1);
1858  return *this;
1859  }
1860  extent operator--(int) __CPU__ __HC__ {
1861  extent ret = *this;
1862  base_.operator-=(1);
1863  return ret;
1864  }
1865 
1868 private:
1869  typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
1870  base base_;
1871  template <int K, typename Q> friend struct Kalmar::index_helper;
1872  template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
1873 };
1874 
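A sketch of extent<N> driving a kernel launch (not part of hc.hpp): a 2-D extent describes the compute domain and parallel_for_each runs the lambda once per index in it.

    #include <hc.hpp>

    void scale(const hc::accelerator_view& av, hc::array_view<float, 2> data, float s) {
      hc::extent<2> ext = data.get_extent();
      hc::completion_future cf =
          hc::parallel_for_each(av, ext, [=](hc::index<2> idx) [[hc]] {
            data[idx] *= s;   // one work-item per point of ext
          });
      cf.wait();
    }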
1875 // ------------------------------------------------------------------------
1876 // global functions for extent
1877 // ------------------------------------------------------------------------
1878 
1889 // FIXME: the signature is not entirely the same as defined in:
1890 // C++AMP spec v1.2 #1253
1891 template <int N>
1892 extent<N> operator+(const extent<N>& lhs, const extent<N>& rhs) __CPU__ __HC__ {
1893  extent<N> __r = lhs;
1894  __r += rhs;
1895  return __r;
1896 }
1897 template <int N>
1898 extent<N> operator-(const extent<N>& lhs, const extent<N>& rhs) __CPU__ __HC__ {
1899  extent<N> __r = lhs;
1900  __r -= rhs;
1901  return __r;
1902 }
1903 
1920 // FIXME: the signature is not entirely the same as defined in:
1921 // C++AMP spec v1.2 #1259
1922 template <int N>
1923 extent<N> operator+(const extent<N>& ext, int value) __CPU__ __HC__ {
1924  extent<N> __r = ext;
1925  __r += value;
1926  return __r;
1927 }
1928 template <int N>
1929 extent<N> operator+(int value, const extent<N>& ext) __CPU__ __HC__ {
1930  extent<N> __r = ext;
1931  __r += value;
1932  return __r;
1933 }
1934 template <int N>
1935 extent<N> operator-(const extent<N>& ext, int value) __CPU__ __HC__ {
1936  extent<N> __r = ext;
1937  __r -= value;
1938  return __r;
1939 }
1940 template <int N>
1941 extent<N> operator-(int value, const extent<N>& ext) __CPU__ __HC__ {
1942  extent<N> __r(value);
1943  __r -= ext;
1944  return __r;
1945 }
1946 template <int N>
1947 extent<N> operator*(const extent<N>& ext, int value) __CPU__ __HC__ {
1948  extent<N> __r = ext;
1949  __r *= value;
1950  return __r;
1951 }
1952 template <int N>
1953 extent<N> operator*(int value, const extent<N>& ext) __CPU__ __HC__ {
1954  extent<N> __r = ext;
1955  __r *= value;
1956  return __r;
1957 }
1958 template <int N>
1959 extent<N> operator/(const extent<N>& ext, int value) __CPU__ __HC__ {
1960  extent<N> __r = ext;
1961  __r /= value;
1962  return __r;
1963 }
1964 template <int N>
1965 extent<N> operator/(int value, const extent<N>& ext) __CPU__ __HC__ {
1966  extent<N> __r(value);
1967  __r /= ext;
1968  return __r;
1969 }
1970 template <int N>
1971 extent<N> operator%(const extent<N>& ext, int value) __CPU__ __HC__ {
1972  extent<N> __r = ext;
1973  __r %= value;
1974  return __r;
1975 }
1976 template <int N>
1977 extent<N> operator%(int value, const extent<N>& ext) __CPU__ __HC__ {
1978  extent<N> __r(value);
1979  __r %= ext;
1980  return __r;
1981 }
1982 
1985 // ------------------------------------------------------------------------
1986 // tiled_extent
1987 // ------------------------------------------------------------------------
1988 
1995 template <int N>
1996 class tiled_extent : public extent<N> {
1997 public:
1998  static const int rank = N;
1999 
2003  int tile_dim[N];
2004 
2009  tiled_extent() __CPU__ __HC__ : extent<N>(), tile_dim{0} {}
2010 
2018  tiled_extent(const tiled_extent& other) __CPU__ __HC__ : extent<N>(other) {
2019  for (int i = 0; i < N; ++i) {
2020  tile_dim[i] = other.tile_dim[i];
2021  }
2022  }
2023 };
2024 
2030 template <>
2031 class tiled_extent<1> : public extent<1> {
2032 private:
2036  unsigned int dynamic_group_segment_size;
2037 
2038 public:
2039  static const int rank = 1;
2040 
2044  int tile_dim[1];
2045 
2050  tiled_extent() __CPU__ __HC__ : extent(0), dynamic_group_segment_size(0), tile_dim{0} {}
2051 
2059  tiled_extent(int e0, int t0) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(0), tile_dim{t0} {}
2060 
2069  tiled_extent(int e0, int t0, int size) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(size), tile_dim{t0} {}
2070 
2078  tiled_extent(const tiled_extent<1>& other) __CPU__ __HC__ : extent(other[0]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0]} {}
2079 
2080 
2087  tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {}
2088 
2096  tiled_extent(const extent<1>& ext, int t0, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0} {}
2097 
2104  void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
2105  dynamic_group_segment_size = size;
2106  }
2107 
2111  unsigned int get_dynamic_group_segment_size() const __CPU__ {
2112  return dynamic_group_segment_size;
2113  }
2114 };
2115 
2121 template <>
2122 class tiled_extent<2> : public extent<2> {
2123 private:
2127  unsigned int dynamic_group_segment_size;
2128 
2129 public:
2130  static const int rank = 2;
2131 
2135  int tile_dim[2];
2136 
2141  tiled_extent() __CPU__ __HC__ : extent(0, 0), dynamic_group_segment_size(0), tile_dim{0, 0} {}
2142 
2152  tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(0), tile_dim{t0, t1} {}
2153 
2164  tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(size), tile_dim{t0, t1} {}
2165 
2173  tiled_extent(const tiled_extent<2>& other) __CPU__ __HC__ : extent(other[0], other[1]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1]} {}
2174 
2182  tiled_extent(const extent<2>& ext, int t0, int t1) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1} {}
2183 
2192  tiled_extent(const extent<2>& ext, int t0, int t1, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1} {}
2193 
2200  void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
2201  dynamic_group_segment_size = size;
2202  }
2203 
2207  unsigned int get_dynamic_group_segment_size() const __CPU__ {
2208  return dynamic_group_segment_size;
2209  }
2210 };
2211 
2217 template <>
2218 class tiled_extent<3> : public extent<3> {
2219 private:
2223  unsigned int dynamic_group_segment_size;
2224 
2225 public:
2226  static const int rank = 3;
2227 
2231  int tile_dim[3];
2232 
2237  tiled_extent() __CPU__ __HC__ : extent(0, 0, 0), dynamic_group_segment_size(0), tile_dim{0, 0, 0} {}
2238 
2250  tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}
2251 
2264  tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}
2265 
2273  tiled_extent(const tiled_extent<3>& other) __CPU__ __HC__ : extent(other[0], other[1], other[2]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1], other.tile_dim[2]} {}
2274 
2283  tiled_extent(const extent<3>& ext, int t0, int t1, int t2) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}
2284 
2294  tiled_extent(const extent<3>& ext, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}
2295 
2302  void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
2303  dynamic_group_segment_size = size;
2304  }
2305 
2309  unsigned int get_dynamic_group_segment_size() const __CPU__ {
2310  return dynamic_group_segment_size;
2311  }
2312 };
2313 
2314 // ------------------------------------------------------------------------
2315 // implementation of extent<N>::tile()
2316 // ------------------------------------------------------------------------
2317 
2318 template <int N>
2319 inline
2320 tiled_extent<1> extent<N>::tile(int t0) const __CPU__ __HC__ {
2321  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
2322  return tiled_extent<1>(*this, t0);
2323 }
2324 
2325 template <int N>
2326 inline
2327 tiled_extent<2> extent<N>::tile(int t0, int t1) const __CPU__ __HC__ {
2328  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
2329  return tiled_extent<2>(*this, t0, t1);
2330 }
2331 
2332 template <int N>
2333 inline
2334 tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const __CPU__ __HC__ {
2335  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
2336  return tiled_extent<3>(*this, t0, t1, t2);
2337 }
2338 
2339 // ------------------------------------------------------------------------
2340 // implementation of extent<N>::tile_with_dynamic()
2341 // ------------------------------------------------------------------------
2342 
2343 template <int N>
2344 inline
2345 tiled_extent<1> extent<N>::tile_with_dynamic(int t0, int dynamic_size) const __CPU__ __HC__ {
2346  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
2347  return tiled_extent<1>(*this, t0, dynamic_size);
2348 }
2349 
2350 template <int N>
2351 inline
2352 tiled_extent<2> extent<N>::tile_with_dynamic(int t0, int t1, int dynamic_size) const __CPU__ __HC__ {
2353  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
2354  return tiled_extent<2>(*this, t0, t1, dynamic_size);
2355 }
2356 
2357 template <int N>
2358 inline
2359 tiled_extent<3> extent<N>::tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const __CPU__ __HC__ {
2360  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
2361  return tiled_extent<3>(*this, t0, t1, t2, dynamic_size);
2362 }
2363 
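A sketch of the tiling helpers just defined (not part of hc.hpp): tile() partitions an extent into fixed-size tiles, and tile_with_dynamic() additionally reserves dynamic group-segment (LDS) bytes for the launch.

    #include <hc.hpp>

    void make_tiled_extents() {
      hc::extent<1> e1(4096);
      hc::tiled_extent<1> t1 = e1.tile(256);                    // 256 work-items per tile

      hc::extent<2> e2(1024, 1024);
      hc::tiled_extent<2> t2 = e2.tile(16, 16);                 // 16x16 tiles

      // Same 1-D shape plus 4 KB of dynamic group segment per tile.
      hc::tiled_extent<1> t1d = e1.tile_with_dynamic(256, 4096);
      (void)t1; (void)t2; (void)t1d;
    }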
2364 // ------------------------------------------------------------------------
2365 // Intrinsic functions for HSAIL instructions
2366 // ------------------------------------------------------------------------
2367 
2373 #define __HSA_WAVEFRONT_SIZE__ (64)
2374 extern "C" unsigned int __wavesize() __HC__;
2375 
2376 
2377 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2378 extern "C" inline unsigned int __wavesize() __HC__ {
2379  return __HSA_WAVEFRONT_SIZE__;
2380 }
2381 #endif
2382 
2389 extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ {
2390  return __builtin_popcount(input);
2391 }
2392 
2399 extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ {
2400  return __builtin_popcountl(input);
2401 }
2402 
2409 extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
2410  return (src0 << (32 - src1 - src2)) >> (32 - src2);
2411 }
2412 
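Reading the implementation above: src1 is the bit offset and src2 is the field width, so the call extracts bits [src1+src2-1 : src1]. A device-side sketch (not part of hc.hpp, assuming the header is included); for 0xABCD1234 with offset 8 and width 8 the result is 0x12.

    inline unsigned int field_demo() [[hc]] {
      // (0xABCD1234 << 16) >> 24 == 0x12
      return hc::__bitextract_u32(0xABCD1234u, /*offset*/ 8, /*width*/ 8);
    }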
2413 extern "C" uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2414 
2415 extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__;
2416 
2417 extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__;
2426 extern "C" unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__;
2427 
2428 extern "C" uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__;
2429 
2430 extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__;
2431 
2432 extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__;
2441 extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__;
2442 
2443 extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__;
2453 unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32");
2454 
2455 uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64");
2456 
2465 extern "C" unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2466 
2467 extern "C" uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__;
2477 extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ {
2478  return input == 0 ? -1 : __builtin_clz(input);
2479 }
2480 
2481 
2489 extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ {
2490  return input == 0 ? -1 : __builtin_clzl(input);
2491 }
2492 
2502 extern "C" inline unsigned int __firstbit_u32_s32(int input) __HC__ {
2503  if (input == 0) {
2504  return -1;
2505  }
2506 
2507  return input > 0 ? __firstbit_u32_u32(input) : __firstbit_u32_u32(~input);
2508 }
2509 
2510 
2520 extern "C" inline unsigned int __firstbit_u32_s64(long long int input) __HC__ {
2521  if (input == 0) {
2522  return -1;
2523  }
2524 
2525  return input > 0 ? __firstbit_u32_u64(input) : __firstbit_u32_u64(~input);
2526 }
2527 
2535 extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ {
2536  return input == 0 ? -1 : __builtin_ctz(input);
2537 }
2538 
2539 extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__ {
2540  return input == 0 ? -1 : __builtin_ctzl(input);
2541 }
2542 
2543 extern "C" inline unsigned int __lastbit_u32_s32(int input) __HC__ {
2544  return __lastbit_u32_u32(input);
2545 }
2546 
2547 extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC__ {
2548  return __lastbit_u32_u64(input);
2549 }
2559 extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__;
2560 
2561 extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__;
2562 
2563 extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__;
2564 
2565 extern "C" uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__;
2566 
2567 extern "C" uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__;
2568 
2569 extern "C" int __unpacklo_s8x4(int src0, int src1) __HC__;
2570 
2571 extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__;
2572 
2573 extern "C" int __unpacklo_s16x2(int src0, int src1) __HC__;
2574 
2575 extern "C" int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__;
2576 
2577 extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__;
2587 extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__;
2588 
2589 extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__;
2590 
2591 extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__;
2592 
2593 extern "C" uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__;
2594 
2595 extern "C" uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__;
2596 
2597 extern "C" int __unpackhi_s8x4(int src0, int src1) __HC__;
2598 
2599 extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__;
2600 
2601 extern "C" int __unpackhi_s16x2(int src0, int src1) __HC__;
2602 
2603 extern "C" int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__;
2604 
2605 extern "C" int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__;
2615 extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2616 
2617 extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2618 
2619 extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2620 
2621 extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2622 
2623 extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2624 
2625 extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__;
2626 
2627 extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__;
2628 
2629 extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__;
2630 
2631 extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__;
2632 
2633 extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__;
2634 
2635 extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__;
2644 extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__;
2645 
2646 extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__;
2647 
2648 extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__;
2649 
2650 extern "C" unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__;
2651 
2652 extern "C" unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__;
2653 
2654 extern "C" int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__;
2655 
2656 extern "C" int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__;
2657 
2658 extern "C" int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__;
2659 
2660 extern "C" int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__;
2661 
2662 extern "C" int __unpack_s32_s3x2(int64_t src0, unsigned int src1) __HC__;
2663 
2664 extern "C" float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__;
2672 extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2673 
2679 extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2680 
2687 extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2688 
2695 extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__;
2696 
2702 extern "C" float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__;
2703 
2711 extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2712 
2713 extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2714 
2715 extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2724 extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2725 
2729 extern "C" uint64_t __clock_u64() __HC__;
2730 
2736 extern "C" uint64_t __cycle_u64() __HC__;
2737 
2744 extern "C" unsigned int __activelaneid_u32() __HC__;
2745 
2758 extern "C" uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__;
2759 
2768 extern "C" inline unsigned int __activelanecount_u32_b1(unsigned int input) __HC__ {
2770 }
2771 
2772 // ------------------------------------------------------------------------
2773 // Wavefront Vote Functions
2774 // ------------------------------------------------------------------------
2775 
2781 extern "C" inline int __any(int predicate) __HC__ {
2783 }
2784 
2790 extern "C" inline int __all(int predicate) __HC__ {
2792 }
2793 
2800 extern "C" inline uint64_t __ballot(int predicate) __HC__ {
2801  return __activelanemask_v4_b64_b1(predicate);
2802 }
2803 
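A sketch of the vote intrinsics inside a kernel (not part of hc.hpp): every lane contributes its own predicate and the whole wavefront observes the combined result.

    #include <hc.hpp>

    void vote_demo(const hc::accelerator_view& av, hc::array_view<int, 1> data) {
      hc::parallel_for_each(av, data.get_extent(), [=](hc::index<1> idx) [[hc]] {
        int pred = data[idx] > 0;
        int any_pos  = hc::__any(pred);      // non-zero if any lane's pred != 0
        int all_pos  = hc::__all(pred);      // non-zero only if every active lane's pred != 0
        uint64_t bal = hc::__ballot(pred);   // one bit per lane
        data[idx] = any_pos + all_pos + int(bal & 1);
      }).wait();
    }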
2804 // ------------------------------------------------------------------------
2805 // Wavefront Shuffle Functions
2806 // ------------------------------------------------------------------------
2807 
2808 // utility union type
2809 union __u {
2810  int i;
2811  unsigned int u;
2812  float f;
2813 };
2814 
2835 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2836 
2837 /*
2838  * FIXME: We need to add __builtin_amdgcn_mbcnt_{lo,hi} to clang and call
2839  * them here instead.
2840  */
2841 
2842 int __amdgcn_mbcnt_lo(int mask, int src) [[hc]] __asm("llvm.amdgcn.mbcnt.lo");
2843 int __amdgcn_mbcnt_hi(int mask, int src) [[hc]] __asm("llvm.amdgcn.mbcnt.hi");
2844 
2845 inline int __lane_id(void) [[hc]] {
2846  int lo = __amdgcn_mbcnt_lo(-1, 0);
2847  return __amdgcn_mbcnt_hi(-1, lo);
2848 }
2849 
2850 #endif
2851 
2852 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2853 
2859 int __amdgcn_ds_bpermute(int index, int src) [[hc]] __asm("llvm.amdgcn.ds.bpermute");
2860 inline unsigned int __amdgcn_ds_bpermute(int index, unsigned int src) [[hc]] {
2861  __u tmp; tmp.u = src;
2862  tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
2863  return tmp.u;
2864 }
2865 inline float __amdgcn_ds_bpermute(int index, float src) [[hc]] {
2866  __u tmp; tmp.f = src;
2867  tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
2868  return tmp.f;
2869 }
2870 
2874 extern "C" int __amdgcn_ds_permute(int index, int src) [[hc]];
2875 inline unsigned int __amdgcn_ds_permute(int index, unsigned int src) [[hc]] {
2876  __u tmp; tmp.u = src;
2877  tmp.i = __amdgcn_ds_permute(index, tmp.i);
2878  return tmp.u;
2879 }
2880 inline float __amdgcn_ds_permute(int index, float src) [[hc]] {
2881  __u tmp; tmp.f = src;
2882  tmp.i = __amdgcn_ds_permute(index, tmp.i);
2883  return tmp.f;
2884 }
2885 
2886 
2890 extern "C" int __amdgcn_ds_swizzle(int src, int pattern) [[hc]];
2891 inline unsigned int __amdgcn_ds_swizzle(unsigned int src, int pattern) [[hc]] {
2892  __u tmp; tmp.u = src;
2893  tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
2894  return tmp.u;
2895 }
2896 inline float __amdgcn_ds_swizzle(float src, int pattern) [[hc]] {
2897  __u tmp; tmp.f = src;
2898  tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
2899  return tmp.f;
2900 }
2901 
2902 
2903 
2907 extern "C" int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) [[hc]];
2908 
2917 extern "C" int __amdgcn_wave_sr1(int src, bool bound_ctrl) [[hc]];
2918 inline unsigned int __amdgcn_wave_sr1(unsigned int src, bool bound_ctrl) [[hc]] {
2919  __u tmp; tmp.u = src;
2920  tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
2921  return tmp.u;
2922 }
2923 inline float __amdgcn_wave_sr1(float src, bool bound_ctrl) [[hc]] {
2924  __u tmp; tmp.f = src;
2925  tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
2926  return tmp.f;
2927 }
2928 
2937 extern "C" int __amdgcn_wave_sl1(int src, bool bound_ctrl) [[hc]];
2938 inline unsigned int __amdgcn_wave_sl1(unsigned int src, bool bound_ctrl) [[hc]] {
2939  __u tmp; tmp.u = src;
2940  tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
2941  return tmp.u;
2942 }
2943 inline float __amdgcn_wave_sl1(float src, bool bound_ctrl) [[hc]] {
2944  __u tmp; tmp.f = src;
2945  tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
2946  return tmp.f;
2947 }
2948 
2949 
2957 extern "C" int __amdgcn_wave_rr1(int src) [[hc]];
2958 inline unsigned int __amdgcn_wave_rr1(unsigned int src) [[hc]] {
2959  __u tmp; tmp.u = src;
2960  tmp.i = __amdgcn_wave_rr1(tmp.i);
2961  return tmp.u;
2962 }
2963 inline float __amdgcn_wave_rr1(float src) [[hc]] {
2964  __u tmp; tmp.f = src;
2965  tmp.i = __amdgcn_wave_rr1(tmp.i);
2966  return tmp.f;
2967 }
2968 
2976 extern "C" int __amdgcn_wave_rl1(int src) [[hc]];
2977 inline unsigned int __amdgcn_wave_rl1(unsigned int src) [[hc]] {
2978  __u tmp; tmp.u = src;
2979  tmp.i = __amdgcn_wave_rl1(tmp.i);
2980  return tmp.u;
2981 }
2982 inline float __amdgcn_wave_rl1(float src) [[hc]] {
2983  __u tmp; tmp.f = src;
2984  tmp.i = __amdgcn_wave_rl1(tmp.i);
2985  return tmp.f;
2986 }
2987 
2988 #endif
2989 
2990 /* definition to expand macro then apply to pragma message
2991 #define VALUE_TO_STRING(x) #x
2992 #define VALUE(x) VALUE_TO_STRING(x)
2993 #define VAR_NAME_VALUE(var) #var "=" VALUE(var)
2994 #pragma message(VAR_NAME_VALUE(__hcc_backend__))
2995 */
2996 
2997 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2998 
2999 inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3000  int self = __lane_id();
3001  int index = srcLane + (self & ~(width-1));
3002  return __amdgcn_ds_bpermute(index<<2, var);
3003 }
3004 
3005 #endif
3006 
3007 inline unsigned int __shfl(unsigned int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3008  __u tmp; tmp.u = var;
3009  tmp.i = __shfl(tmp.i, srcLane, width);
3010  return tmp.u;
3011 }
3012 
3013 
3014 inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3015  __u tmp; tmp.f = var;
3016  tmp.i = __shfl(tmp.i, srcLane, width);
3017  return tmp.f;
3018 }
3019 
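A device-side sketch (not part of hc.hpp, assuming the header is included): __shfl with a fixed source lane broadcasts that lane's value to every lane in the wavefront.

    inline float broadcast_lane0(float v) [[hc]] {
      // Every lane receives the value held by lane 0 of its wavefront.
      return hc::__shfl(v, 0);
    }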
3020 // FIXME: support half type
3045 #if __hcc_backend__==HCC_BACKEND_AMDGPU
3046 
3047 inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3048  int self = __lane_id();
3049  int index = self - delta;
3050  index = (index < (self & ~(width-1)))?self:index;
3051  return __amdgcn_ds_bpermute(index<<2, var);
3052 }
3053 
3054 #endif
3055 
3056 inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3057  __u tmp; tmp.u = var;
3058  tmp.i = __shfl_up(tmp.i, delta, width);
3059  return tmp.u;
3060 }
3061 
3062 inline float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3063  __u tmp; tmp.f = var;
3064  tmp.i = __shfl_up(tmp.i, delta, width);
3065  return tmp.f;
3066 }
3067 
3068 // FIXME: support half type
3094 #if __hcc_backend__==HCC_BACKEND_AMDGPU
3095 
3096 inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3097  int self = __lane_id();
3098  int index = self + delta;
3099  index = (int)((self&(width-1))+delta) >= width?self:index;
3100  return __amdgcn_ds_bpermute(index<<2, var);
3101 }
3102 
3103 #endif
3104 
3105 inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3106  __u tmp; tmp.u = var;
3107  tmp.i = __shfl_down(tmp.i, delta, width);
3108  return tmp.u;
3109 }
3110 
3111 inline float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3112  __u tmp; tmp.f = var;
3113  tmp.i = __shfl_down(tmp.i, delta, width);
3114  return tmp.f;
3115 }
3116 
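A device-side sketch (not part of hc.hpp): the usual wavefront sum reduction built on __shfl_down. After the loop, lane 0 holds the sum over the whole wavefront; other lanes hold partial values.

    inline float wavefront_sum(float v) [[hc]] {
      for (int offset = __HSA_WAVEFRONT_SIZE__ / 2; offset > 0; offset /= 2)
        v += hc::__shfl_down(v, offset);
      return v;   // meaningful on lane 0
    }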
3117 
3118 // FIXME: support half type
3139 #if __hcc_backend__==HCC_BACKEND_AMDGPU
3140 
3141 
3142 inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3143  int self = __lane_id();
3144  int index = self^laneMask;
3145  index = index >= ((self+width)&~(width-1))?self:index;
3146  return __amdgcn_ds_bpermute(index<<2, var);
3147 }
3148 
3149 #endif
3150 
3151 inline float __shfl_xor(float var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3152  __u tmp; tmp.f = var;
3153  tmp.i = __shfl_xor(tmp.i, laneMask, width);
3154  return tmp.f;
3155 }
3156 
3157 // FIXME: support half type
3160 inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3161  __u tmp; tmp.u = var;
3162  tmp.i = __shfl_xor(tmp.i, laneMask, width);
3163  return tmp.u;
3164 }
3165 
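// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// a butterfly all-reduce built on __shfl_xor. At the full 64-lane width the
// XOR partner always stays inside the wavefront, so after log2(64) = 6 steps
// every lane holds the total. User-side kernel code, shown for exposition.
inline int wave_allreduce_sum(int value) [[hc]] {
    for (int mask = 32; mask > 0; mask /= 2)
        value += __shfl_xor(value, mask);
    return value;                         // identical in every lane
}
// (end of illustrative sketch)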
3173 inline unsigned int __mul24(unsigned int x, unsigned int y) [[hc]] {
3174  return (x & 0x00FFFFFF) * (y & 0x00FFFFFF);
3175 }
3176 
3184 inline int __mul24(int x, int y) [[hc]] {
3185  return ((x << 8) >> 8) * ((y << 8) >> 8);
3186 }
3187 
3197 inline unsigned int __mad24(unsigned int x, unsigned int y, unsigned int z) [[hc]] {
3198  return __mul24(x,y) + z;
3199 }
3200 
3210 inline int __mad24(int x, int y, int z) [[hc]] {
3211  return __mul24(x,y) + z;
3212 }
3213 
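// Illustrative note (editor's addition, not part of the original hc.hpp):
// __mul24 / __mad24 are exact only while both multiplicands fit in 24 bits,
// which makes them a natural fit for flattening small 2-D coordinates.
// User-side kernel code, shown here for exposition.
inline int flatten_2d(int row, int col, int pitch) [[hc]] {
    return __mad24(row, pitch, col);      // row * pitch + col, for row, pitch < 2^24
}
// Counter-example: __mul24(1u << 24, 2u) yields 0, not 1 << 25, because the
// upper bits of the first operand are masked off before the multiply.
// (end of illustrative sketch)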
3214 inline void abort() __HC__ {
3215  __builtin_trap();
3216 }
3217 
3218 // ------------------------------------------------------------------------
3219 // group segment
3220 // ------------------------------------------------------------------------
3221 
3229 extern "C" unsigned int get_group_segment_size() __HC__;
3230 
3236 extern "C" unsigned int get_static_group_segment_size() __HC__;
3237 
3241 extern "C" void* get_group_segment_base_pointer() __HC__;
3242 
3246 extern "C" void* get_dynamic_group_segment_base_pointer() __HC__;
3247 
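// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// addressing the dynamic group segment (LDS) from a kernel. The host side
// reserves extra group-segment bytes on the tiled_extent (this assumes the
// tiled_extent<1>::set_dynamic_group_segment_size() API), and the kernel
// reaches them through get_dynamic_group_segment_base_pointer(). User-side
// code, shown here for exposition; 'out' is assumed to have a tile-aligned
// extent.
inline void dynamic_lds_example(hc::array_view<int, 1> out) {
    hc::tiled_extent<1> ext = out.get_extent().tile(64);
    ext.set_dynamic_group_segment_size(64 * sizeof(int));
    hc::parallel_for_each(ext, [=](hc::tiled_index<1> tidx) [[hc]] {
        int* lds = static_cast<int*>(get_dynamic_group_segment_base_pointer());
        lds[tidx.local[0]] = tidx.global[0];
        tidx.barrier.wait();                        // make all LDS stores visible
        out[tidx.global] = lds[63 - tidx.local[0]]; // read a neighbouring lane's slot
    });
}
// (end of illustrative sketch)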
3248 // ------------------------------------------------------------------------
3249 // utility class for tiled_barrier
3250 // ------------------------------------------------------------------------
3251 
3252 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3253 template <typename Ker, typename Ti>
3254 void bar_wrapper(Ker *f, Ti *t)
3255 {
3256  (*f)(*t);
3257 }
3258 
3259 struct barrier_t {
3260  std::unique_ptr<ucontext_t[]> ctx;
3261  int idx;
3262  barrier_t (int a) :
3263  ctx(new ucontext_t[a + 1]) {}
3264  template <typename Ti, typename Ker>
3265  void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) {
3266  getcontext(&ctx[x]);
3267  ctx[x].uc_stack.ss_sp = stack;
3268  ctx[x].uc_stack.ss_size = S;
3269  ctx[x].uc_link = &ctx[x - 1];
3270  makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
3271  }
3272  void swap(int a, int b) {
3273  swapcontext(&ctx[a], &ctx[b]);
3274  }
3275  void wait() __HC__ {
3276  --idx;
3277  swapcontext(&ctx[idx + 1], &ctx[idx]);
3278  }
3279 };
3280 #endif
3281 
3282 
3283 // ------------------------------------------------------------------------
3284 // tiled_barrier
3285 // ------------------------------------------------------------------------
3286 
3294 class tile_barrier {
3295 public:
3296 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3297  using pb_t = std::shared_ptr<barrier_t>;
3298  tile_barrier(pb_t pb) : pbar(pb) {}
3299 
3307  tile_barrier(const tile_barrier& other) __CPU__ __HC__ : pbar(other.pbar) {}
3308 #else
3309 
3317  tile_barrier(const tile_barrier& other) __CPU__ __HC__ {}
3318 #endif
3319 
3330  void wait() const __HC__ {
3331 #if __KALMAR_ACCELERATOR__ == 1
3332  wait_with_all_memory_fence();
3333 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3334  pbar->wait();
3335 #endif
3336  }
3337 
3347  void wait_with_all_memory_fence() const __HC__ {
3348 #if __KALMAR_ACCELERATOR__ == 1
3349  amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
3350 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3351  pbar->wait();
3352 #endif
3353  }
3354 
3364  void wait_with_global_memory_fence() const __HC__ {
3365 #if __KALMAR_ACCELERATOR__ == 1
3366  amp_barrier(CLK_GLOBAL_MEM_FENCE);
3367 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3368  pbar->wait();
3369 #endif
3370  }
3371 
3382  void wait_with_tile_static_memory_fence() const __HC__ {
3383 #if __KALMAR_ACCELERATOR__ == 1
3384  amp_barrier(CLK_LOCAL_MEM_FENCE);
3385 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3386  pbar->wait();
3387 #endif
3388  }
3389 
3390 private:
3391 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3392  tile_barrier() __CPU__ __HC__ = default;
3393  pb_t pbar;
3394 #else
3395  tile_barrier() __HC__ {}
3396 #endif
3397 
3398  template <int N> friend
3399  class tiled_index;
3400 };
3401 
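// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// the canonical tile_barrier pattern: stage data in tile_static memory,
// wait(), then combine. 'per_tile_sum' is assumed to hold one element per
// tile and 'in' to have a tile-aligned extent. User-side code, shown here
// for exposition.
inline void tile_sum_example(hc::array_view<const int, 1> in,
                             hc::array_view<int, 1> per_tile_sum) {
    hc::parallel_for_each(in.get_extent().tile(64),
                          [=](hc::tiled_index<1> tidx) [[hc]] {
        tile_static int lds[64];
        lds[tidx.local[0]] = in[tidx.global];
        tidx.barrier.wait();                        // all loads staged in LDS
        for (int stride = 32; stride > 0; stride /= 2) {
            if (tidx.local[0] < stride)
                lds[tidx.local[0]] += lds[tidx.local[0] + stride];
            tidx.barrier.wait_with_tile_static_memory_fence();
        }
        if (tidx.local[0] == 0)
            per_tile_sum[tidx.tile] = lds[0];       // one partial sum per tile
    });
}
// (end of illustrative sketch)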
3402 // ------------------------------------------------------------------------
3403 // other memory fences
3404 // ------------------------------------------------------------------------
3405 
3411 // FIXME: this function has not been implemented.
3412 void all_memory_fence(const tile_barrier&) __HC__;
3413 
3419 // FIXME: this function has not been implemented.
3420 void global_memory_fence(const tile_barrier&) __HC__;
3421 
3427 // FIXME: this function has not been implemented.
3428 void tile_static_memory_fence(const tile_barrier&) __HC__;
3429 
3430 // ------------------------------------------------------------------------
3431 // tiled_index
3432 // ------------------------------------------------------------------------
3433 
3440 template <int N=3>
3441 class tiled_index {
3442 public:
3447  static const int rank = 3;
3448 
3456  tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
3457 
3463  const index<3> global;
3469  const index<3> local;
3475  const index<3> tile;
3481  const index<3> tile_origin;
3486  const tile_barrier barrier;
3491  const index<3> tile_dim;
3497  operator const index<3>() const __CPU__ __HC__ {
3498  return global;
3499  }
3500 
3501  tiled_index(const index<3>& g) __CPU__ __HC__ : global(g) {}
3502 
3503 private:
3504 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3505  __attribute__((always_inline)) tiled_index(int a0, int a1, int a2, int b0, int b1, int b2, int c0, int c1, int c2, tile_barrier& pb, int D0, int D1, int D2) __CPU__ __HC__
3506  : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0), tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {}
3507 #endif
3508 
3509  __attribute__((annotate("__cxxamp_opencl_index")))
3510 #if __KALMAR_ACCELERATOR__ == 1
3511  __attribute__((always_inline)) tiled_index() __HC__
3512  : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
3513  local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
3514  tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
3515  tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2),
3516  amp_get_global_id(1) - amp_get_local_id(1),
3517  amp_get_global_id(0) - amp_get_local_id(0))),
3518  tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0)))
3519 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3520  __attribute__((always_inline)) tiled_index() __CPU__ __HC__
3521 #else
3522  __attribute__((always_inline)) tiled_index() __HC__
3523 #endif // __KALMAR_ACCELERATOR__
3524  {}
3525 
3526  template<typename Kernel> friend
3527  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<N>&, const Kernel&);
3528 
3529 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3530  template<typename K> friend
3531  void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int);
3532 #endif
3533 };
3534 
3535 
3541 template<>
3542 class tiled_index<1> {
3543 public:
3548  static const int rank = 1;
3549 
3557  tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
3558 
3564  const index<1> global;
3570  const index<1> local;
3576  const index<1> tile;
3582  const index<1> tile_origin;
3587  const tile_barrier barrier;
3592  const index<1> tile_dim;
3598  operator const index<1>() const __CPU__ __HC__ {
3599  return global;
3600  }
3601 
3602  tiled_index(const index<1>& g) __CPU__ __HC__ : global(g) {}
3603 
3604 private:
3605 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3606  __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__
3607  : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {}
3608 #endif
3609 
3610  __attribute__((annotate("__cxxamp_opencl_index")))
3611 #if __KALMAR_ACCELERATOR__ == 1
3612  __attribute__((always_inline)) tiled_index() __HC__
3613  : global(index<1>(amp_get_global_id(0))),
3614  local(index<1>(amp_get_local_id(0))),
3615  tile(index<1>(amp_get_group_id(0))),
3616  tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))),
3617  tile_dim(index<1>(amp_get_local_size(0)))
3618 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3619  __attribute__((always_inline)) tiled_index() __CPU__ __HC__
3620 #else
3621  __attribute__((always_inline)) tiled_index() __HC__
3622 #endif // __KALMAR_ACCELERATOR__
3623  {}
3624 
3625  template<typename Kernel> friend
3626  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
3627 
3628 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3629  template<typename K> friend
3630  void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int);
3631 #endif
3632 };
3633 
3639 template<>
3640 class tiled_index<2> {
3641 public:
3646  static const int rank = 2;
3647 
3655  tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
3656 
3662  const index<2> global;
3668  const index<2> local;
3674  const index<2> tile;
3680  const index<2> tile_origin;
3685  const tile_barrier barrier;
3690  const index<2> tile_dim;
3696  operator const index<2>() const __CPU__ __HC__ {
3697  return global;
3698  }
3699 
3700  tiled_index(const index<2>& g) __CPU__ __HC__ : global(g) {}
3701 
3702 private:
3703 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3704  __attribute__((always_inline)) tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& pb, int D0, int D1) __CPU__ __HC__
3705  : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1) {}
3706 #endif
3707 
3708  __attribute__((annotate("__cxxamp_opencl_index")))
3709 #if __KALMAR_ACCELERATOR__ == 1
3710  __attribute__((always_inline)) tiled_index() __HC__
3711  : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
3712  local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
3713  tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
3714  tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1),
3715  amp_get_global_id(0) - amp_get_local_id(0))),
3716  tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0)))
3717 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3718  __attribute__((always_inline)) tiled_index() __CPU__ __HC__
3719 #else
3720  __attribute__((always_inline)) tiled_index() __HC__
3721 #endif // __KALMAR_ACCELERATOR__
3722  {}
3723 
3724  template<typename Kernel> friend
3725  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
3726 
3727 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3728  template<typename K> friend
3729  void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int);
3730 #endif
3731 };
3732 
3733 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3734 #define SSIZE (1024 * 10)
3735 template <int N, typename Kernel, int K>
3736 struct cpu_helper
3737 {
3738  static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
3739  int i;
3740  for (i = 0; i < ext[N]; ++i) {
3741  idx[N] = i;
3742  cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
3743  }
3744  }
3745 };
3746 template <typename Kernel, int K>
3747 struct cpu_helper<K, Kernel, K>
3748 {
3749  static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
3750  (const_cast<Kernel&>(k))(idx);
3751  }
3752 };
3753 
3754 template <typename Kernel, int N>
3755 void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
3756  index<N> idx;
3757  int start = ext[0] * part / Kalmar::NTHREAD;
3758  int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
3759  for (int i = start; i < end; i++) {
3760  idx[0] = i;
3761  cpu_helper<1, Kernel, N>::call(ker, idx, ext);
3762  }
3763 }
3764 
3765 template <typename Kernel>
3766 void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) {
3767  int D0 = ext.tile_dim[0];
3768  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
3769  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
3770  int stride = end - start;
3771  if (stride == 0)
3772  return;
3773  char *stk = new char[D0 * SSIZE];
3774  tiled_index<1> *tidx = new tiled_index<1>[D0];
3775  tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0);
3776  tile_barrier tbar(hc_bar);
3777  for (int tx = start; tx < end; tx++) {
3778  int id = 0;
3779  char *sp = stk;
3780  tiled_index<1> *tip = tidx;
3781  for (int x = 0; x < D0; x++) {
3782  new (tip) tiled_index<1>(tx * D0 + x, x, tx, tbar, D0);
3783  hc_bar->setctx(++id, sp, f, tip, SSIZE);
3784  sp += SSIZE;
3785  ++tip;
3786  }
3787  hc_bar->idx = 0;
3788  while (hc_bar->idx == 0) {
3789  hc_bar->idx = id;
3790  hc_bar->swap(0, id);
3791  }
3792  }
3793  delete [] stk;
3794  delete [] tidx;
3795 }
3796 
3797 template <typename Kernel>
3798 void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) {
3799  int D0 = ext.tile_dim[0];
3800  int D1 = ext.tile_dim[1];
3801  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
3802  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
3803  int stride = end - start;
3804  if (stride == 0)
3805  return;
3806  char *stk = new char[D1 * D0 * SSIZE];
3807  tiled_index<2> *tidx = new tiled_index<2>[D0 * D1];
3808  tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1);
3809  tile_barrier tbar(hc_bar);
3810 
3811  for (int tx = 0; tx < ext[1] / D1; tx++)
3812  for (int ty = start; ty < end; ty++) {
3813  int id = 0;
3814  char *sp = stk;
3815  tiled_index<2> *tip = tidx;
3816  for (int x = 0; x < D1; x++)
3817  for (int y = 0; y < D0; y++) {
3818  new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1);
3819  hc_bar->setctx(++id, sp, f, tip, SSIZE);
3820  ++tip;
3821  sp += SSIZE;
3822  }
3823  hc_bar->idx = 0;
3824  while (hc_bar->idx == 0) {
3825  hc_bar->idx = id;
3826  hc_bar->swap(0, id);
3827  }
3828  }
3829  delete [] stk;
3830  delete [] tidx;
3831 }
3832 
3833 template <typename Kernel>
3834 void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) {
3835  int D0 = ext.tile_dim[0];
3836  int D1 = ext.tile_dim[1];
3837  int D2 = ext.tile_dim[2];
3838  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
3839  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
3840  int stride = end - start;
3841  if (stride == 0)
3842  return;
3843  char *stk = new char[D2 * D1 * D0 * SSIZE];
3844  tiled_index<3> *tidx = new tiled_index<3>[D0 * D1 * D2];
3845  tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
3846  tile_barrier tbar(hc_bar);
3847 
3848  for (int i = 0; i < ext[2] / D2; i++)
3849  for (int j = 0; j < ext[1] / D1; j++)
3850  for(int k = start; k < end; k++) {
3851  int id = 0;
3852  char *sp = stk;
3853  tiled_index<3> *tip = tidx;
3854  for (int x = 0; x < D2; x++)
3855  for (int y = 0; y < D1; y++)
3856  for (int z = 0; z < D0; z++) {
3857  new (tip) tiled_index<3>(D2 * i + x,
3858  D1 * j + y,
3859  D0 * k + z,
3860  x, y, z, i, j, k, tbar, D0, D1, D2);
3861  hc_bar->setctx(++id, sp, f, tip, SSIZE);
3862  ++tip;
3863  sp += SSIZE;
3864  }
3865  hc_bar->idx = 0;
3866  while (hc_bar->idx == 0) {
3867  hc_bar->idx = id;
3868  hc_bar->swap(0, id);
3869  }
3870  }
3871  delete [] stk;
3872  delete [] tidx;
3873 }
3874 
3875 template <typename Kernel, int N>
3876 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3877  extent<N> const& compute_domain)
3878 {
3879  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3880  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3881  obj[i] = std::thread(partitioned_task<Kernel, N>, std::cref(f), std::cref(compute_domain), i);
3882  // FIXME wrap the above operation into the completion_future object
3883  return completion_future();
3884 }
3885 
3886 template <typename Kernel>
3887 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3888  tiled_extent<1> const& compute_domain)
3889 {
3890  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3891  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3892  obj[i] = std::thread(partitioned_task_tile_1D<Kernel>,
3893  std::cref(f), std::cref(compute_domain), i);
3894  // FIXME wrap the above operation into the completion_future object
3895  return completion_future();
3896 }
3897 
3898 template <typename Kernel>
3899 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3900  tiled_extent<2> const& compute_domain)
3901 {
3902  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3903  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3904  obj[i] = std::thread(partitioned_task_tile_2D<Kernel>,
3905  std::cref(f), std::cref(compute_domain), i);
3906  // FIXME wrap the above operation into the completion_future object
3907  return completion_future();
3908 }
3909 
3910 template <typename Kernel>
3911 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3912  tiled_extent<3> const& compute_domain)
3913 {
3914  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3915  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3916  obj[i] = std::thread(partitioned_task_tile_3D<Kernel>,
3917  std::cref(f), std::cref(compute_domain), i);
3918  // FIXME wrap the above operation into the completion_future object
3919  return completion_future();
3920 }
3921 
3922 #endif
3923 
3924 // ------------------------------------------------------------------------
3925 // utility helper classes for array_view
3926 // ------------------------------------------------------------------------
3927 
3928 template <typename T, int N>
3929 struct projection_helper
3930 {
3931  // array_view<T,N>, where N>1
3932  // array_view<T,N-1> operator[](int i) const __CPU__ __HC__
3933  static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
3934  typedef array_view<T, N - 1> result_type;
3935  static result_type project(array_view<T, N>& now, int stride) __CPU__ __HC__ {
3936  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
3937  for (i = N - 1; i > 0; --i) {
3938  ext_o[i - 1] = now.extent[i];
3939  ext[i - 1] = now.extent_base[i];
3940  idx[i - 1] = now.index_base[i];
3941  }
3942  stride += now.index_base[0];
3943  extent<N - 1> ext_now(ext_o);
3944  extent<N - 1> ext_base(ext);
3945  index<N - 1> idx_base(idx);
3946  return result_type (now.cache, ext_now, ext_base, idx_base,
3947  now.offset + ext_base.size() * stride);
3948  }
3949  static result_type project(const array_view<T, N>& now, int stride) __CPU__ __HC__ {
3950  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
3951  for (i = N - 1; i > 0; --i) {
3952  ext_o[i - 1] = now.extent[i];
3953  ext[i - 1] = now.extent_base[i];
3954  idx[i - 1] = now.index_base[i];
3955  }
3956  stride += now.index_base[0];
3957  extent<N - 1> ext_now(ext_o);
3958  extent<N - 1> ext_base(ext);
3959  index<N - 1> idx_base(idx);
3960  return result_type (now.cache, ext_now, ext_base, idx_base,
3961  now.offset + ext_base.size() * stride);
3962  }
3963 };
3964 
3965 template <typename T>
3966 struct projection_helper<T, 1>
3967 {
3968  // array_view<T,1>
3969  // T& operator[](int i) const __CPU__ __HC__;
3970  typedef T& result_type;
3971  static result_type project(array_view<T, 1>& now, int i) __CPU__ __HC__ {
3972 #if __KALMAR_ACCELERATOR__ != 1
3973  now.cache.get_cpu_access(true);
3974 #endif
3975  T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
3976  return *ptr;
3977  }
3978  static result_type project(const array_view<T, 1>& now, int i) __CPU__ __HC__ {
3979 #if __KALMAR_ACCELERATOR__ != 1
3980  now.cache.get_cpu_access(true);
3981 #endif
3982  T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
3983  return *ptr;
3984  }
3985 };
3986 
3987 template <typename T, int N>
3988 struct projection_helper<const T, N>
3989 {
3990  // array_view<T,N>, where N>1
3991  // array_view<const T,N-1> operator[](int i) const __CPU__ __HC__;
3992  static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
3993  typedef array_view<const T, N - 1> const_result_type;
3994  static const_result_type project(array_view<const T, N>& now, int stride) __CPU__ __HC__ {
3995  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
3996  for (i = N - 1; i > 0; --i) {
3997  ext_o[i - 1] = now.extent[i];
3998  ext[i - 1] = now.extent_base[i];
3999  idx[i - 1] = now.index_base[i];
4000  }
4001  stride += now.index_base[0];
4002  extent<N - 1> ext_now(ext_o);
4003  extent<N - 1> ext_base(ext);
4004  index<N - 1> idx_base(idx);
4005  auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
4006  now.offset + ext_base.size() * stride);
4007  return ret;
4008  }
4009  static const_result_type project(const array_view<const T, N>& now, int stride) __CPU__ __HC__ {
4010  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
4011  for (i = N - 1; i > 0; --i) {
4012  ext_o[i - 1] = now.extent[i];
4013  ext[i - 1] = now.extent_base[i];
4014  idx[i - 1] = now.index_base[i];
4015  }
4016  stride += now.index_base[0];
4017  extent<N - 1> ext_now(ext_o);
4018  extent<N - 1> ext_base(ext);
4019  index<N - 1> idx_base(idx);
4020  auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
4021  now.offset + ext_base.size() * stride);
4022  return ret;
4023  }
4024 };
4025 
4026 template <typename T>
4027 struct projection_helper<const T, 1>
4028 {
4029  // array_view<const T,1>
4030  // const T& operator[](int i) const __CPU__ __HC__;
4031  typedef const T& const_result_type;
4032  static const_result_type project(array_view<const T, 1>& now, int i) __CPU__ __HC__ {
4033 #if __KALMAR_ACCELERATOR__ != 1
4034  now.cache.get_cpu_access();
4035 #endif
4036  const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
4037  return *ptr;
4038  }
4039  static const_result_type project(const array_view<const T, 1>& now, int i) __CPU__ __HC__ {
4040 #if __KALMAR_ACCELERATOR__ != 1
4041  now.cache.get_cpu_access();
4042 #endif
4043  const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
4044  return *ptr;
4045  }
4046 };
4047 
4048 // ------------------------------------------------------------------------
4049 // utility helper classes for array_view
4050 // ------------------------------------------------------------------------
4051 
4052 template <typename T>
4053 struct __has_data
4054 {
4055 private:
4056  struct two {char __lx; char __lxx;};
4057  template <typename C> static char test(decltype(std::declval<C>().data()));
4058  template <typename C> static two test(...);
4059 public:
4060  static const bool value = sizeof(test<T>(0)) == 1;
4061 };
4062 
4063 template <typename T>
4064 struct __has_size
4065 {
4066 private:
4067  struct two {char __lx; char __lxx;};
4068  template <typename C> static char test(decltype(&C::size));
4069  template <typename C> static two test(...);
4070 public:
4071  static const bool value = sizeof(test<T>(0)) == 1;
4072 };
4073 
4074 template <typename T>
4075 struct __is_container
4076 {
4077  using _T = typename std::remove_reference<T>::type;
4078  static const bool value = __has_size<_T>::value && __has_data<_T>::value;
4079 };
4080 
4081 
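// Illustrative note (editor's addition, not part of the original hc.hpp):
// __is_container<C> holds exactly when C exposes both data() and size(),
// which is what the container-taking array_view constructors below rely on.
// For example (assuming <vector> is available):
//
//   static_assert( __is_container<std::vector<float>>::value, "vector qualifies");
//   static_assert(!__is_container<float*>::value, "a raw pointer does not");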
4082 // ------------------------------------------------------------------------
4083 // utility helper classes for array
4084 // ------------------------------------------------------------------------
4085 
4086 template <typename T, int N>
4087 struct array_projection_helper
4088 {
4089  // array<T,N>, where N>1
4090  // array_view<T,N-1> operator[](int i0) __CPU__ __HC__;
4091  // array_view<const T,N-1> operator[](int i0) const __CPU__ __HC__;
4092  static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher");
4093  typedef array_view<T, N - 1> result_type;
4094  typedef array_view<const T, N - 1> const_result_type;
4095  static result_type project(array<T, N>& now, int stride) __CPU__ __HC__ {
4096 #if __KALMAR_ACCELERATOR__ != 1
4097  if( stride < 0)
4098  throw runtime_exception("errorMsg_throw", 0);
4099 #endif
4100  int comp[N - 1], i;
4101  for (i = N - 1; i > 0; --i)
4102  comp[i - 1] = now.extent[i];
4103  extent<N - 1> ext(comp);
4104  int offset = ext.size() * stride;
4105 #if __KALMAR_ACCELERATOR__ != 1
4106  if( offset >= now.extent.size())
4107  throw runtime_exception("errorMsg_throw", 0);
4108 #endif
4109  return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
4110  }
4111  static const_result_type project(const array<T, N>& now, int stride) __CPU__ __HC__ {
4112  int comp[N - 1], i;
4113  for (i = N - 1; i > 0; --i)
4114  comp[i - 1] = now.extent[i];
4115  extent<N - 1> ext(comp);
4116  int offset = ext.size() * stride;
4117  return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
4118  }
4119 };
4120 
4121 template <typename T>
4122 struct array_projection_helper<T, 1>
4123 {
4124  // array<T,1>
4125  // T& operator[](int i0) __CPU__ __HC__;
4126  // const T& operator[](int i0) const __CPU__ __HC__;
4127  typedef T& result_type;
4128  typedef const T& const_result_type;
4129  static result_type project(array<T, 1>& now, int i) __CPU__ __HC__ {
4130 #if __KALMAR_ACCELERATOR__ != 1
4131  now.m_device.synchronize(true);
4132 #endif
4133  T *ptr = reinterpret_cast<T *>(now.m_device.get() + i);
4134  return *ptr;
4135  }
4136  static const_result_type project(const array<T, 1>& now, int i) __CPU__ __HC__ {
4137 #if __KALMAR_ACCELERATOR__ != 1
4138  now.m_device.synchronize();
4139 #endif
4140  const T *ptr = reinterpret_cast<const T *>(now.m_device.get() + i);
4141  return *ptr;
4142  }
4143 };
4144 
4145 template <int N>
4146 const extent<N>& check(const extent<N>& ext)
4147 {
4148 #if __KALMAR_ACCELERATOR__ != 1
4149  for (int i = 0; i < N; i++)
4150  {
4151  if(ext[i] <=0)
4152  throw runtime_exception("errorMsg_throw", 0);
4153  }
4154 #endif
4155  return ext;
4156 }
4157 
4158 // ------------------------------------------------------------------------
4159 // forward declarations of copy routines used by array / array_view
4160 // ------------------------------------------------------------------------
4161 
4162 template <typename T, int N>
4163 void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
4164 
4165 template <typename T, int N>
4166 void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
4167 
4168 template <typename T, int N>
4169 void copy(const array<T, N>& src, const array_view<T, N>& dest);
4170 
4171 template <typename T, int N>
4172 void copy(const array<T, N>& src, array<T, N>& dest);
4173 
4174 template <typename T, int N>
4175 void copy(const array_view<const T, N>& src, array<T, N>& dest);
4176 
4177 template <typename T, int N>
4178 void copy(const array_view<T, N>& src, array<T, N>& dest);
4179 
4180 template <typename InputIter, typename T, int N>
4181 void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
4182 
4183 template <typename InputIter, typename T, int N>
4184 void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
4185 
4186 template <typename InputIter, typename T, int N>
4187 void copy(InputIter srcBegin, const array_view<T, N>& dest);
4188 
4189 template <typename InputIter, typename T, int N>
4190 void copy(InputIter srcBegin, array<T, N>& dest);
4191 
4192 template <typename OutputIter, typename T, int N>
4193 void copy(const array_view<T, N> &src, OutputIter destBegin);
4194 
4195 template <typename OutputIter, typename T, int N>
4196 void copy(const array<T, N> &src, OutputIter destBegin);
4197 
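// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// a typical host <-> accelerator round trip through the copy() overloads
// declared above. User-side code, shown here for exposition; assumes
// <vector> is available.
inline void copy_round_trip_example() {
    std::vector<float> host(1024, 1.0f);
    hc::array<float, 1> dev(1024);
    hc::copy(host.begin(), host.end(), dev);   // host container -> device array
    // ... launch kernels that read and write 'dev' here ...
    hc::copy(dev, host.begin());               // device array -> host container
}
// (end of illustrative sketch)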
4198 // ------------------------------------------------------------------------
4199 // array
4200 // ------------------------------------------------------------------------
4201 
4209 template <typename T, int N = 1>
4210 class array {
4211  static_assert(!std::is_const<T>::value, "array<const T> is not supported");
4212 public:
4213 #if __KALMAR_ACCELERATOR__ == 1
4214  typedef Kalmar::_data<T> acc_buffer_t;
4215 #else
4216  typedef Kalmar::_data_host<T> acc_buffer_t;
4217 #endif
4218 
4222  static const int rank = N;
4223 
4227  typedef T value_type;
4228 
4232  array() = delete;
4233 
4242  array(const array& other)
4243  : array(other.get_extent(), other.get_accelerator_view())
4244  { copy(other, *this); }
4245 
4253  array(array&& other)
4254  : m_device(other.m_device), extent(other.extent)
4255  { other.m_device.reset(); }
4256 
4264  explicit array(const extent<N>& ext)
4265  : array(ext, accelerator(L"default").get_default_view()) {}
4266 
4274  explicit array(int e0)
4275  : array(hc::extent<N>(e0)) { static_assert(N == 1, "illegal"); }
4276  explicit array(int e0, int e1)
4277  : array(hc::extent<N>(e0, e1)) {}
4278  explicit array(int e0, int e1, int e2)
4279  : array(hc::extent<N>(e0, e1, e2)) {}
4280 
4297  template <typename InputIter>
4298  array(const extent<N>& ext, InputIter srcBegin)
4299  : array(ext, srcBegin, accelerator(L"default").get_default_view()) {}
4300  template <typename InputIter>
4301  array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
4302  : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}
4303 
4316  template <typename InputIter>
4317  array(int e0, InputIter srcBegin)
4318  : array(hc::extent<N>(e0), srcBegin) {}
4319  template <typename InputIter>
4320  array(int e0, InputIter srcBegin, InputIter srcEnd)
4321  : array(hc::extent<N>(e0), srcBegin, srcEnd) {}
4322  template <typename InputIter>
4323  array(int e0, int e1, InputIter srcBegin)
4324  : array(hc::extent<N>(e0, e1), srcBegin) {}
4325  template <typename InputIter>
4326  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
4327  : array(hc::extent<N>(e0, e1), srcBegin, srcEnd) {}
4328  template <typename InputIter>
4329  array(int e0, int e1, int e2, InputIter srcBegin)
4330  : array(hc::extent<N>(e0, e1, e2), srcBegin) {}
4331  template <typename InputIter>
4332  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
4333  : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd) {}
4334 
4348  explicit array(const array_view<const T, N>& src)
4349  : array(src.get_extent(), accelerator(L"default").get_default_view())
4350  { copy(src, *this); }
4351 
4374  array(const extent<N>& ext, accelerator_view av, access_type cpu_access_type = access_type_auto)
4375 #if __KALMAR_ACCELERATOR__ == 1
4376  : m_device(ext.size()), extent(ext) {}
4377 #else
4378  : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {}
4379 #endif
4380 
4385  explicit array(int e0, void* accelerator_pointer)
4386  : array(hc::extent<N>(e0), accelerator(L"default").get_default_view(), accelerator_pointer) {}
4387  explicit array(int e0, int e1, void* accelerator_pointer)
4388  : array(hc::extent<N>(e0, e1), accelerator(L"default").get_default_view(), accelerator_pointer) {}
4389  explicit array(int e0, int e1, int e2, void* accelerator_pointer)
4390  : array(hc::extent<N>(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {}
4391 
4392  explicit array(const extent<N>& ext, void* accelerator_pointer)
4393  : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {}
4405  explicit array(const extent<N>& ext, accelerator_view av, void* accelerator_pointer, access_type cpu_access_type = access_type_auto)
4406 #if __KALMAR_ACCELERATOR__ == 1
4407  : m_device(ext.size(), accelerator_pointer), extent(ext) {}
4408 #else
4409  : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type), extent(ext) {}
4410 #endif
4411 
4423  array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto)
4424  : array(hc::extent<N>(e0), av, cpu_access_type) {}
4425  array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto)
4426  : array(hc::extent<N>(e0, e1), av, cpu_access_type) {}
4427  array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto)
4428  : array(hc::extent<N>(e0, e1, e2), av, cpu_access_type) {}
4429 
4459  template <typename InputIter>
4460  array(const extent<N>& ext, InputIter srcBegin, accelerator_view av,
4461  access_type cpu_access_type = access_type_auto)
4462  : array(ext, av, cpu_access_type) { copy(srcBegin, *this); }
4463  template <typename InputIter>
4464  array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
4465  accelerator_view av, access_type cpu_access_type = access_type_auto)
4466  : array(ext, av, cpu_access_type) {
4467  if (ext.size() < std::distance(srcBegin, srcEnd))
4468  throw runtime_exception("errorMsg_throw", 0);
4469  copy(srcBegin, srcEnd, *this);
4470  }
4471 
4500  array(const array_view<const T, N>& src, accelerator_view av, access_type cpu_access_type = access_type_auto)
4501  : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); }
4502 
4516  template <typename InputIter>
4517  array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
4518  : array(extent<N>(e0), srcBegin, av, cpu_access_type) {}
4519  template <typename InputIter>
4520  array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
4521  : array(extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {}
4522  template <typename InputIter>
4523  array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
4524  : array(hc::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {}
4525  template <typename InputIter>
4526  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
4527  : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {}
4528  template <typename InputIter>
4529  array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
4530  : array(hc::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {}
4531  template <typename InputIter>
4532  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
4533  : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {}
4534 
4549  array(const extent<N>& ext, accelerator_view av, accelerator_view associated_av)
4550 #if __KALMAR_ACCELERATOR__ == 1
4551  : m_device(ext.size()), extent(ext) {}
4552 #else
4553  : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {}
4554 #endif
4555 
4568  array(int e0, accelerator_view av, accelerator_view associated_av)
4569  : array(hc::extent<N>(e0), av, associated_av) {}
4570  array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
4571  : array(hc::extent<N>(e0, e1), av, associated_av) {}
4572  array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
4573  : array(hc::extent<N>(e0, e1, e2), av, associated_av) {}
4574 
4592  template <typename InputIter>
4593  array(const extent<N>& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4594  : array(ext, av, associated_av) { copy(srcBegin, *this); }
4595  template <typename InputIter>
4596  array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4597  : array(ext, av, associated_av) {
4598  if (ext.size() < std::distance(srcBegin, srcEnd))
4599  throw runtime_exception("errorMsg_throw", 0);
4600  copy(srcBegin, srcEnd, *this);
4601  }
4602 
4621  array(const array_view<const T, N>& src, accelerator_view av, accelerator_view associated_av)
4622  : array(src.get_extent(), av, associated_av)
4623  { copy(src, *this); }
4624 
4639  template <typename InputIter>
4640  array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4641  : array(extent<N>(e0), srcBegin, av, associated_av) {}
4642  template <typename InputIter>
4643  array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4644  : array(extent<N>(e0), srcBegin, srcEnd, av, associated_av) {}
4645  template <typename InputIter>
4646  array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4647  : array(hc::extent<N>(e0, e1), srcBegin, av, associated_av) {}
4648  template <typename InputIter>
4649  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4650  : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, associated_av) {}
4651  template <typename InputIter>
4652  array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4653  : array(hc::extent<N>(e0, e1, e2), srcBegin, av, associated_av) {}
4654  template <typename InputIter>
4655  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4656  : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {}
4657 
4663  extent<N> get_extent() const __CPU__ __HC__ { return extent; }
4664 
4669  accelerator_view get_accelerator_view() const { return m_device.get_av(); }
4670 
4675  accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); }
4676 
4680  access_type get_cpu_access_type() const { return m_device.get_access(); }
4681 
4690  array& operator=(const array& other) {
4691  if (this != &other) {
4692  array arr(other);
4693  *this = std::move(arr);
4694  }
4695  return *this;
4696  }
4697 
4705  array& operator=(array&& other) {
4706  if (this != &other) {
4707  extent = other.extent;
4708  m_device = other.m_device;
4709  other.m_device.reset();
4710  }
4711  return *this;
4712  }
4713 
4722  array& operator=(const array_view<const T, N>& src) {
4723  array arr(src);
4724  *this = std::move(arr);
4725  return *this;
4726  }
4727 
4735  void copy_to(array& dest) const {
4736 #if __KALMAR_ACCELERATOR__ != 1
4737  for(int i = 0 ; i < N ; i++)
4738  {
4739  if (dest.extent[i] < this->extent[i] )
4740  throw runtime_exception("errorMsg_throw", 0);
4741  }
4742 #endif
4743  copy(*this, dest);
4744  }
4745 
4753  void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
4754 
4760  T* data() const __CPU__ __HC__ {
4761 #if __KALMAR_ACCELERATOR__ != 1
4762  if (!m_device.get())
4763  return nullptr;
4764  m_device.synchronize(true);
4765 #endif
4766  return reinterpret_cast<T*>(m_device.get());
4767  }
4768 
4775  T* accelerator_pointer() const __CPU__ __HC__ {
4776  return reinterpret_cast<T*>(m_device.get_device_pointer());
4777  }
4778 
4786  operator std::vector<T>() const {
4787  std::vector<T> vec(extent.size());
4788  copy(*this, vec.data());
4789  return vec; // copy elision / implicit move applies; std::move is not needed here
4790  }
4791 
4803  T& operator[](const index<N>& idx) __CPU__ __HC__ {
4804 #if __KALMAR_ACCELERATOR__ != 1
4805  if (!m_device.get())
4806  throw runtime_exception("The array is not accessible on CPU.", 0);
4807  m_device.synchronize(true);
4808 #endif
4809  T *ptr = reinterpret_cast<T*>(m_device.get());
4810  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
4811  }
4812  T& operator()(const index<N>& idx) __CPU__ __HC__ {
4813  return (*this)[idx];
4814  }
4815 
4829  const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
4830 #if __KALMAR_ACCELERATOR__ != 1
4831  if (!m_device.get())
4832  throw runtime_exception("The array is not accessible on CPU.", 0);
4833  m_device.synchronize();
4834 #endif
4835  T *ptr = reinterpret_cast<T*>(m_device.get());
4836  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
4837  }
4838  const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
4839  return (*this)[idx];
4840  }
4841 
4852  T& operator()(int i0, int i1) __CPU__ __HC__ {
4853  return (*this)[index<2>(i0, i1)];
4854  }
4855  T& operator()(int i0, int i1, int i2) __CPU__ __HC__ {
4856  return (*this)[index<3>(i0, i1, i2)];
4857  }
4858 
4869  const T& operator()(int i0, int i1) const __CPU__ __HC__ {
4870  return (*this)[index<2>(i0, i1)];
4871  }
4872  const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
4873  return (*this)[index<3>(i0, i1, i2)];
4874  }
4875 
4893  typename array_projection_helper<T, N>::result_type
4894  operator[] (int i) __CPU__ __HC__ {
4895  return array_projection_helper<T, N>::project(*this, i);
4896  }
4897  typename array_projection_helper<T, N>::result_type
4898  operator()(int i0) __CPU__ __HC__ {
4899  return (*this)[i0];
4900  }
4901  typename array_projection_helper<T, N>::const_result_type
4902  operator[] (int i) const __CPU__ __HC__ {
4903  return array_projection_helper<T, N>::project(*this, i);
4904  }
4905  typename array_projection_helper<T, N>::const_result_type
4906  operator()(int i0) const __CPU__ __HC__ {
4907  return (*this)[i0];
4908  }
4909 
4930  array_view<T, N> section(const index<N>& origin, const extent<N>& ext) __CPU__ __HC__ {
4931 #if __KALMAR_ACCELERATOR__ != 1
4932  if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(origin, ext ,this->extent) )
4933  throw runtime_exception("errorMsg_throw", 0);
4934 #endif
4935  array_view<T, N> av(*this);
4936  return av.section(origin, ext);
4937  }
4938  array_view<const T, N> section(const index<N>& origin, const extent<N>& ext) const __CPU__ __HC__ {
4939  array_view<const T, N> av(*this);
4940  return av.section(origin, ext);
4941  }
4942 
4949  array_view<T, N> section(const index<N>& idx) __CPU__ __HC__ {
4950 #if __KALMAR_ACCELERATOR__ != 1
4951  if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, this->extent ) )
4952  throw runtime_exception("errorMsg_throw", 0);
4953 #endif
4954  array_view<T, N> av(*this);
4955  return av.section(idx);
4956  }
4957  array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
4958  array_view<const T, N> av(*this);
4959  return av.section(idx);
4960  }
4961 
4968  array_view<T,N> section(const extent<N>& ext) __CPU__ __HC__ {
4969  array_view<T, N> av(*this);
4970  return av.section(ext);
4971  }
4972  array_view<const T,N> section(const extent<N>& ext) const __CPU__ __HC__ {
4973  array_view<const T, N> av(*this);
4974  return av.section(ext);
4975  }
4976 
4989  array_view<T, 1> section(int i0, int e0) __CPU__ __HC__ {
4990  static_assert(N == 1, "Rank must be 1");
4991  return section(index<1>(i0), hc::extent<1>(e0));
4992  }
4993  array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
4994  static_assert(N == 1, "Rank must be 1");
4995  return section(index<1>(i0), hc::extent<1>(e0));
4996  }
4997  array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
4998  static_assert(N == 2, "Rank must be 2");
4999  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
5000  }
5001  array_view<T, 2> section(int i0, int i1, int e0, int e1) __CPU__ __HC__ {
5002  static_assert(N == 2, "Rank must be 2");
5003  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
5004  }
5005  array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ {
5006  static_assert(N == 3, "Rank must be 3");
5007  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
5008  }
5009  array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
5010  static_assert(N == 3, "Rank must be 3");
5011  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
5012  }
5013 
5037  template <typename ElementType>
5038  array_view<ElementType, 1> reinterpret_as() __CPU__ __HC__ {
5039 #if __KALMAR_ACCELERATOR__ != 1
5040  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
5041  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
5042  if( (extent.size() * sizeof(T)) % sizeof(ElementType))
5043  throw runtime_exception("errorMsg_throw", 0);
5044 #endif
5045  int size = extent.size() * sizeof(T) / sizeof(ElementType);
5046  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
5047  array_view<ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
5048  return av;
5049  }
5050  template <typename ElementType>
5051  array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
5052 #if __KALMAR_ACCELERATOR__ != 1
5053  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
5054  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
5055 #endif
5056  int size = extent.size() * sizeof(T) / sizeof(ElementType);
5057  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
5058  array_view<const ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
5059  return av;
5060  }
5061 
5077  template <int K> array_view<T, K>
5078  view_as(const extent<K>& viewExtent) __CPU__ __HC__ {
5079 #if __KALMAR_ACCELERATOR__ != 1
5080  if( viewExtent.size() > extent.size())
5081  throw runtime_exception("errorMsg_throw", 0);
5082 #endif
5083  array_view<T, K> av(m_device, viewExtent, 0);
5084  return av;
5085  }
5086  template <int K> array_view<const T, K>
5087  view_as(const extent<K>& viewExtent) const __CPU__ __HC__ {
5088 #if __KALMAR_ACCELERATOR__ != 1
5089  if( viewExtent.size() > extent.size())
5090  throw runtime_exception("errorMsg_throw", 0);
5091 #endif
5092  const array_view<T, K> av(m_device, viewExtent, 0);
5093  return av;
5094  }
5095 
5098  ~array() {}
5099 
5100  // FIXME: functions below may be considered to move to private
5101  const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; }
5102  int get_offset() const __CPU__ __HC__ { return 0; }
5103  index<N> get_index_base() const __CPU__ __HC__ { return index<N>(); }
5104 private:
5105  template <typename K, int Q> friend struct projection_helper;
5106  template <typename K, int Q> friend struct array_projection_helper;
5107  acc_buffer_t m_device;
5108  extent<N> extent;
5109 
5110  template <typename Q, int K> friend
5111  void copy(const array<Q, K>&, const array_view<Q, K>&);
5112  template <typename Q, int K> friend
5113  void copy(const array_view<const Q, K>&, array<Q, K>&);
5114 };
5115 
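// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// SAXPY over two hc::array objects. Arrays must be captured by reference in
// the kernel lambda; the result is copied back with copy(). User-side code,
// shown here for exposition; assumes <vector> and a usable default
// accelerator.
inline void saxpy_example(float a, const std::vector<float>& hx,
                          std::vector<float>& hy) {
    hc::extent<1> ext(static_cast<int>(hx.size()));
    hc::array<float, 1> x(ext, hx.begin());          // copies host data in
    hc::array<float, 1> y(ext, hy.begin());
    hc::completion_future done =
        hc::parallel_for_each(ext, [&x, &y, a](hc::index<1> idx) [[hc]] {
            y[idx] = a * x[idx] + y[idx];
        });
    done.wait();
    hc::copy(y, hy.begin());                         // copy the result back out
}
// (end of illustrative sketch)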
5116 // ------------------------------------------------------------------------
5117 // array_view
5118 // ------------------------------------------------------------------------
5119 
5126 template <typename T, int N = 1>
5127 class array_view
5128 {
5129 public:
5130  typedef typename std::remove_const<T>::type nc_T;
5131 #if __KALMAR_ACCELERATOR__ == 1
5132  typedef Kalmar::_data<T> acc_buffer_t;
5133 #else
5134  typedef Kalmar::_data_host<T> acc_buffer_t;
5135 #endif
5136 
5140  static const int rank = N;
5141 
5145  typedef T value_type;
5146 
5150  array_view() = delete;
5151 
5160  array_view(array<T, N>& src) __CPU__ __HC__
5161  : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
5162 
5163  // FIXME: following interfaces were not implemented yet
5164  // template <typename Container>
5165  // explicit array_view<T, 1>::array_view(Container& src);
5166  // template <typename value_type, int Size>
5167  // explicit array_view<T, 1>::array_view(value_type (&src) [Size]) __CPU__ __HC__;
5168 
5179  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5180  array_view(const extent<N>& extent, Container& src)
5181  : array_view(extent, src.data())
5182  { static_assert( std::is_same<decltype(src.data()), T*>::value, "container element type and array view element type must match"); }
5183 
5194  array_view(const extent<N>& ext, value_type* src) __CPU__ __HC__
5195 #if __KALMAR_ACCELERATOR__ == 1
5196  : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {}
5197 #else
5198  : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {}
5199 #endif
5200 
5211  explicit array_view(const extent<N>& ext)
5212  : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {}
5213 
5224  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5225  array_view(int e0, Container& src)
5226  : array_view(hc::extent<N>(e0), src) {}
5227  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5228  array_view(int e0, int e1, Container& src)
5229  : array_view(hc::extent<N>(e0, e1), src) {}
5230  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5231  array_view(int e0, int e1, int e2, Container& src)
5232  : array_view(hc::extent<N>(e0, e1, e2), src) {}
5233 
5244  array_view(int e0, value_type *src) __CPU__ __HC__
5245  : array_view(hc::extent<N>(e0), src) {}
5246  array_view(int e0, int e1, value_type *src) __CPU__ __HC__
5247  : array_view(hc::extent<N>(e0, e1), src) {}
5248  array_view(int e0, int e1, int e2, value_type *src) __CPU__ __HC__
5249  : array_view(hc::extent<N>(e0, e1, e2), src) {}
5250 
5258  explicit array_view(int e0) : array_view(hc::extent<N>(e0)) {}
5259  explicit array_view(int e0, int e1)
5260  : array_view(hc::extent<N>(e0, e1)) {}
5261  explicit array_view(int e0, int e1, int e2)
5262  : array_view(hc::extent<N>(e0, e1, e2)) {}
5263 
5272  array_view(const array_view& other) __CPU__ __HC__
5273  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
5274 
5278  extent<N> get_extent() const __CPU__ __HC__ { return extent; }
5279 
5289  accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
5290 
5299  array_view& operator=(const array_view& other) __CPU__ __HC__ {
5300  if (this != &other) {
5301  cache = other.cache;
5302  extent = other.extent;
5303  index_base = other.index_base;
5304  extent_base = other.extent_base;
5305  offset = other.offset;
5306  }
5307  return *this;
5308  }
5309 
5317  void copy_to(array<T,N>& dest) const {
5318 #if __KALMAR_ACCELERATOR__ != 1
5319  for (int i = 0; i < N; i++)
5320  {
5321  if (dest.get_extent()[i] < this->extent[i])
5322  throw runtime_exception("errorMsg_throw", 0);
5323  }
5324 #endif
5325  copy(*this, dest);
5326  }
5327 
5335  void copy_to(const array_view& dest) const { copy(*this, dest); }
5336 
5352  T* data() const __CPU__ __HC__ {
5353 
5354 #if __KALMAR_ACCELERATOR__ != 1
5355  cache.get_cpu_access(true);
5356 #endif
5357  static_assert(N == 1, "data() is only permissible on array views of rank 1");
5358  return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
5359  }
5360 
5367  T* accelerator_pointer() const __CPU__ __HC__ {
5368  return reinterpret_cast<T*>(cache.get_device_pointer() + offset + index_base[0]);
5369  }
5370 
5376  void refresh() const { cache.refresh(); }
5377 
5411  // FIXME: type parameter is not implemented
5412  void synchronize() const { cache.get_cpu_access(); }
5413 
5424  // FIXME: type parameter is not implemented
5425  completion_future synchronize_async() const {
5426  std::future<void> fut = std::async([&]() mutable { synchronize(); });
5427  return completion_future(fut.share());
5428  }
5429 
5463  // FIXME: type parameter is not implemented
5464  void synchronize_to(const accelerator_view& av) const {
5465 #if __KALMAR_ACCELERATOR__ != 1
5466  cache.sync_to(av.pQueue);
5467 #endif
5468  }
5469 
5485  // FIXME: this method is not implemented yet
5486  completion_future synchronize_to_async(const accelerator_view& av) const;
5487 
5495  void discard_data() const {
5496 #if __KALMAR_ACCELERATOR__ != 1
5497  cache.discard();
5498 #endif
5499  }
5500 
5509  T& operator[] (const index<N>& idx) const __CPU__ __HC__ {
5510 #if __KALMAR_ACCELERATOR__ != 1
5511  cache.get_cpu_access(true);
5512 #endif
5513  T *ptr = reinterpret_cast<T*>(cache.get() + offset);
5514  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
5515  }
5516 
5517  T& operator()(const index<N>& idx) const __CPU__ __HC__ {
5518  return (*this)[idx];
5519  }
5520 
5534  // FIXME: this method is not implemented
5535  T& get_ref(const index<N>& idx) const __CPU__ __HC__;
5536 
5545  T& operator() (int i0, int i1) const __CPU__ __HC__ {
5546  static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
5547  return (*this)[index<2>(i0, i1)];
5548  }
5549  T& operator() (int i0, int i1, int i2) const __CPU__ __HC__ {
5550  static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
5551  return (*this)[index<3>(i0, i1, i2)];
5552  }
5553 
5575  typename projection_helper<T, N>::result_type
5576  operator[] (int i) const __CPU__ __HC__ {
5577  return projection_helper<T, N>::project(*this, i);
5578  }
5579  typename projection_helper<T, N>::result_type
5580  operator() (int i0) const __CPU__ __HC__ { return (*this)[i0]; }
5581 
5602  array_view<T, N> section(const index<N>& idx,
5603  const extent<N>& ext) const __CPU__ __HC__ {
5604 #if __KALMAR_ACCELERATOR__ != 1
5605  if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, ext,this->extent ) )
5606  throw runtime_exception("errorMsg_throw", 0);
5607 #endif
5608  array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset);
5609  return av;
5610  }
5611 
5615  array_view<T, N> section(const index<N>& idx) const __CPU__ __HC__ {
5616  hc::extent<N> ext(extent);
5617  Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
5618  return section(idx, ext);
5619  }
5620 
5624  array_view<T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
5625  index<N> idx;
5626  return section(idx, ext);
5627  }
5628 
5639  array_view<T, 1> section(int i0, int e0) const __CPU__ __HC__ {
5640  static_assert(N == 1, "Rank must be 1");
5641  return section(index<1>(i0), hc::extent<1>(e0));
5642  }
5643 
5644  array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
5645  static_assert(N == 2, "Rank must be 2");
5646  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
5647  }
5648 
5649  array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
5650  static_assert(N == 3, "Rank must be 3");
5651  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
5652  }
5653 
5667  template <typename ElementType>
5668  array_view<ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
5669  static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
5670 #if __KALMAR_ACCELERATOR__ != 1
5671  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
5672  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
5673  if ( (extent.size() * sizeof(T)) % sizeof(ElementType))
5674  throw runtime_exception("errorMsg_throw", 0);
5675 #endif
5676  int size = extent.size() * sizeof(T) / sizeof(ElementType);
5677  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
5678  array_view<ElementType, 1> av(buffer_type(cache),
5679  extent<1>(size),
5680  (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
5681  return av;
5682  }
5683 
5692  template <int K>
5693  array_view<T, K> view_as(extent<K> viewExtent) const __CPU__ __HC__ {
5694  static_assert(N == 1, "view_as is only permissible on array views of rank 1");
5695 #if __KALMAR_ACCELERATOR__ != 1
5696  if ( viewExtent.size() > extent.size())
5697  throw runtime_exception("errorMsg_throw", 0);
5698 #endif
5699  array_view<T, K> av(cache, viewExtent, offset + index_base[0]);
5700  return av;
5701  }
5702 
5703  ~array_view() __CPU__ __HC__ {}
5704 
5705  // FIXME: the following functions could be considered to move to private
5706  const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
5707 
5708  int get_offset() const __CPU__ __HC__ { return offset; }
5709 
5710  index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
5711 
5712 private:
5713  template <typename K, int Q> friend struct projection_helper;
5714  template <typename K, int Q> friend struct array_projection_helper;
5715  template <typename Q, int K> friend class array;
5716  template <typename Q, int K> friend class array_view;
5717 
5718  template<typename Q, int K> friend
5719  bool is_flat(const array_view<Q, K>&) noexcept;
5720  template <typename Q, int K> friend
5721  void copy(const array<Q, K>&, const array_view<Q, K>&);
5722  template <typename InputIter, typename Q, int K> friend
5723  void copy(InputIter, InputIter, const array_view<Q, K>&);
5724  template <typename Q, int K> friend
5725  void copy(const array_view<const Q, K>&, array<Q, K>&);
5726  template <typename OutputIter, typename Q, int K> friend
5727  void copy(const array_view<Q, K>&, OutputIter);
5728  template <typename Q, int K> friend
5729  void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
5730 
5731  // used by view_as and reinterpret_as
5732  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
5733  int offset) __CPU__ __HC__
5734  : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
5735 
5736  // used by section and projection
5737  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
5738  const hc::extent<N>& ext_b,
5739  const index<N>& idx_b, int off) __CPU__ __HC__
5740  : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
5741  offset(off) {}
5742 
5743  acc_buffer_t cache;
5744  hc::extent<N> extent;
5745  hc::extent<N> extent_base;
5746  index<N> index_base;
5747  int offset;
5748 };
5749 
5750 // ------------------------------------------------------------------------
5751 // array_view (read-only)
5752 // ------------------------------------------------------------------------
5753 
5761 template <typename T, int N>
5762 class array_view<const T, N>
5763 {
5764 public:
5765  typedef typename std::remove_const<T>::type nc_T;
5766 
5767 #if __KALMAR_ACCELERATOR__ == 1
5768  typedef Kalmar::_data<nc_T> acc_buffer_t;
5769 #else
5770  typedef Kalmar::_data_host<const T> acc_buffer_t;
5771 #endif
5772 
5776  static const int rank = N;
5777 
5781  typedef const T value_type;
5782 
5786  array_view() = delete;
5787 
5796  array_view(const array<T,N>& src) __CPU__ __HC__
5797  : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
5798 
5799  // FIXME: following interfaces were not implemented yet
5800  // template <typename Container>
5801  // explicit array_view<const T, 1>::array_view(const Container& src);
5802  // template <typename value_type, int Size>
5803  // explicit array_view<const T, 1>::array_view(const value_type (&src) [Size]) __CPU__ __HC__;
5804 
5815  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5816  array_view(const extent<N>& extent, const Container& src)
5817  : array_view(extent, src.data())
5818  { static_assert( std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value, "container element type and array view element type must match"); }
5819 
5830  array_view(const extent<N>& ext, const value_type* src) __CPU__ __HC__
5831 #if __KALMAR_ACCELERATOR__ == 1
5832  : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
5833 #else
5834  : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
5835 #endif
5836 
5847  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5848  array_view(int e0, Container& src) : array_view(hc::extent<1>(e0), src) {}
5849  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5850  array_view(int e0, int e1, Container& src)
5851  : array_view(hc::extent<N>(e0, e1), src) {}
5852  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5853  array_view(int e0, int e1, int e2, Container& src)
5854  : array_view(hc::extent<N>(e0, e1, e2), src) {}
5855 
5866  array_view(int e0, const value_type *src) __CPU__ __HC__
5867  : array_view(hc::extent<1>(e0), src) {}
5868  array_view(int e0, int e1, const value_type *src) __CPU__ __HC__
5869  : array_view(hc::extent<2>(e0, e1), src) {}
5870  array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__
5871  : array_view(hc::extent<3>(e0, e1, e2), src) {}
5872 
5881  array_view(const array_view<nc_T, N>& other) __CPU__ __HC__
5882  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
5883 
5891  array_view(const array_view& other) __CPU__ __HC__
5892  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
5893 
5897  extent<N> get_extent() const __CPU__ __HC__ { return extent; }
5898 
5908  accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
5909 
5919  array_view& operator=(const array_view<T,N>& other) __CPU__ __HC__ {
5920  cache = other.cache;
5921  extent = other.extent;
5922  index_base = other.index_base;
5923  extent_base = other.extent_base;
5924  offset = other.offset;
5925  return *this;
5926  }
5927 
5928  array_view& operator=(const array_view& other) __CPU__ __HC__ {
5929  if (this != &other) {
5930  cache = other.cache;
5931  extent = other.extent;
5932  index_base = other.index_base;
5933  extent_base = other.extent_base;
5934  offset = other.offset;
5935  }
5936  return *this;
5937  }
5938 
5948  void copy_to(array<T,N>& dest) const { copy(*this, dest); }
5949 
5957  void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
5958 
5974  const T* data() const __CPU__ __HC__ {
5975 #if __KALMAR_ACCELERATOR__ != 1
5976  cache.get_cpu_access();
5977 #endif
5978  static_assert(N == 1, "data() is only permissible on array views of rank 1");
5979  return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
5980  }
5981 
5988  const T* accelerator_pointer() const __CPU__ __HC__ {
5989  return reinterpret_cast<const T*>(cache.get_device_pointer() + offset + index_base[0]);
5990  }
5991 
5997  void refresh() const { cache.refresh(); }
5998 
6028  void synchronize() const { cache.get_cpu_access(); }
6029 
6040  completion_future synchronize_async() const {
6041  std::future<void> fut = std::async([&]() mutable { synchronize(); });
6042  return completion_future(fut.share());
6043  }
6044 
6056  void synchronize_to(const accelerator_view& av) const {
6057 #if __KALMAR_ACCELERATOR__ != 1
6058  cache.sync_to(av.pQueue);
6059 #endif
6060  }
6061 
6077  // FIXME: this method is not implemented yet
6078  completion_future synchronize_to_async(const accelerator_view& av) const;
6079 
6088  const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
6089 #if __KALMAR_ACCELERATOR__ != 1
6090  cache.get_cpu_access();
6091 #endif
6092  const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
6093  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
6094  }
6095  const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
6096  return (*this)[idx];
6097  }
6098 
6112  // FIXME: this method is not implemented
6113  const T& get_ref(const index<N>& idx) const __CPU__ __HC__;
6114 
6123  const T& operator()(int i0) const __CPU__ __HC__ {
6124  static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
6125  return (*this)[index<1>(i0)];
6126  }
6127 
6128  const T& operator()(int i0, int i1) const __CPU__ __HC__ {
6129  static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
6130  return (*this)[index<2>(i0, i1)];
6131  }
6132  const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
6133  static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
6134  return (*this)[index<3>(i0, i1, i2)];
6135  }
6136 
6158  typename projection_helper<const T, N>::const_result_type
6159  operator[] (int i) const __CPU__ __HC__ {
6160  return projection_helper<const T, N>::project(*this, i);
6161  }
6162 
6163  // FIXME: typename projection_helper<const T, N>::const_result_type
6164  // operator() (int i0) const __CPU__ __HC__
6165  // is not implemented
6166 
6187  array_view<const T, N> section(const index<N>& idx,
6188  const extent<N>& ext) const __CPU__ __HC__ {
6189  array_view<const T, N> av(cache, ext, extent_base, idx + index_base, offset);
6190  return av;
6191  }
6192 
6196  array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
6197  hc::extent<N> ext(extent);
6198  Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
6199  return section(idx, ext);
6200  }
6201 
6205  array_view<const T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
6206  index<N> idx;
6207  return section(idx, ext);
6208  }
6209 
6220  array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
6221  static_assert(N == 1, "Rank must be 1");
6222  return section(index<1>(i0), hc::extent<1>(e0));
6223  }
6224 
6225  array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
6226  static_assert(N == 2, "Rank must be 2");
6227  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
6228  }
6229 
6230  array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
6231  static_assert(N == 3, "Rank must be 3");
6232  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
6233  }
6234 
6248  template <typename ElementType>
6249  array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
6250  static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
6251 #if __KALMAR_ACCELERATOR__ != 1
6252  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
6253  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
6254 #endif
6255  int size = extent.size() * sizeof(T) / sizeof(ElementType);
6256  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
6257  array_view<const ElementType, 1> av(buffer_type(cache),
6258  extent<1>(size),
6259  (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
6260  return av;
6261  }
6262 
6271  template <int K>
6272  array_view<const T, K> view_as(extent<K> viewExtent) const __CPU__ __HC__ {
6273  static_assert(N == 1, "view_as is only permissible on array views of rank 1");
6274 #if __KALMAR_ACCELERATOR__ != 1
6275  if ( viewExtent.size() > extent.size())
6276  throw runtime_exception("errorMsg_throw", 0);
6277 #endif
6278  array_view<const T, K> av(cache, viewExtent, offset + index_base[0]);
6279  return av;
6280  }
6281 
6282  ~array_view() __CPU__ __HC__ {}
6283 
6284  // FIXME: the following functions could be moved to the private section
6285  const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
6286 
6287  int get_offset() const __CPU__ __HC__ { return offset; }
6288 
6289  index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
6290 
6291 private:
6292  template <typename K, int Q> friend struct projection_helper;
6293  template <typename K, int Q> friend struct array_projection_helper;
6294  template <typename Q, int K> friend class array;
6295  template <typename Q, int K> friend class array_view;
6296 
6297  template<typename Q, int K> friend
6298  bool is_flat(const array_view<Q, K>&) noexcept;
6299  template <typename Q, int K> friend
6300  void copy(const array<Q, K>&, const array_view<Q, K>&);
6301  template <typename InputIter, typename Q, int K> friend
6302  void copy(InputIter, InputIter, const array_view<Q, K>&);
6303  template <typename Q, int K> friend
6304  void copy(const array_view<const Q, K>&, array<Q, K>&);
6305  template <typename OutputIter, typename Q, int K> friend
6306  void copy(const array_view<Q, K>&, OutputIter);
6307  template <typename Q, int K> friend
6308  void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
6309 
6310  // used by view_as and reinterpret_as
6311  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
6312  int offset) __CPU__ __HC__
6313  : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
6314 
6315  // used by section and projection
6316  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
6317  const extent<N>& ext_b,
6318  const index<N>& idx_b, int off) __CPU__ __HC__
6319  : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
6320  offset(off) {}
6321 
6322  acc_buffer_t cache;
6323  hc::extent<N> extent;
6324  hc::extent<N> extent_base;
6325  index<N> index_base;
6326  int offset;
6327 };
6328 
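// Example (illustrative sketch, not part of the original header): a read-only
// view over host data. The name "src" is a placeholder host container.
//
//   std::vector<int> src(100, 7);
//   hc::array_view<const int, 1> cav(100, src);  // elements cannot be modified through cav
//   int first = cav(0);                          // read access on the host side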
6329 // ------------------------------------------------------------------------
6330 // utility functions for copy
6331 // ------------------------------------------------------------------------
6332 
6333 template<typename T, int N>
6334 static inline bool is_flat(const array_view<T, N>& av) noexcept {
6335  return av.extent == av.extent_base && av.index_base == index<N>();
6336 }
6337 
6338 template<typename T>
6339 static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
6340 
6341 template <typename InputIter, typename T, int N, int dim>
6342 struct copy_input
6343 {
6344  void operator()(InputIter& It, T* ptr, const extent<N>& ext,
6345  const extent<N>& base, const index<N>& idx)
6346  {
6347  size_t stride = 1;
6348  for (int i = dim; i < N; i++)
6349  stride *= base[i];
6350  ptr += stride * idx[dim - 1];
6351  for (int i = 0; i < ext[dim - 1]; i++) {
6352  copy_input<InputIter, T, N, dim + 1>()(It, ptr, ext, base, idx);
6353  ptr += stride;
6354  }
6355  }
6356 };
6357 
6358 template <typename InputIter, typename T, int N>
6359 struct copy_input<InputIter, T, N, N>
6360 {
6361  void operator()(InputIter& It, T* ptr, const extent<N>& ext,
6362  const extent<N>& base, const index<N>& idx)
6363  {
6364  InputIter end = It;
6365  std::advance(end, ext[N - 1]);
6366  std::copy(It, end, ptr + idx[N - 1]);
6367  It = end;
6368  }
6369 };
6370 
6371 template <typename OutputIter, typename T, int N, int dim>
6372 struct copy_output
6373 {
6374  void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
6375  const extent<N>& base, const index<N>& idx)
6376  {
6377  size_t stride = 1;
6378  for (int i = dim; i < N; i++)
6379  stride *= base[i];
6380  ptr += stride * idx[dim - 1];
6381  for (int i = 0; i < ext[dim - 1]; i++) {
6382  copy_output<OutputIter, T, N, dim + 1>()(ptr, It, ext, base, idx);
6383  ptr += stride;
6384  }
6385  }
6386 };
6387 
6388 template <typename OutputIter, typename T, int N>
6389 struct copy_output<OutputIter, T, N, N>
6390 {
6391  void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
6392  const extent<N>& base, const index<N>& idx)
6393  {
6394  ptr += idx[N - 1];
6395  It = std::copy(ptr, ptr + ext[N - 1], It);
6396  }
6397 };
6398 
6399 template <typename T, int N, int dim>
6400 struct copy_bidir
6401 {
6402  void operator()(const T* src, T* dst, const extent<N>& ext,
6403  const extent<N>& base1, const index<N>& idx1,
6404  const extent<N>& base2, const index<N>& idx2)
6405  {
6406  size_t stride1 = 1;
6407  for (int i = dim; i < N; i++)
6408  stride1 *= base1[i];
6409  src += stride1 * idx1[dim - 1];
6410 
6411  size_t stride2 = 1;
6412  for (int i = dim; i < N; i++)
6413  stride2 *= base2[i];
6414  dst += stride2 * idx2[dim - 1];
6415 
6416  for (int i = 0; i < ext[dim - 1]; i++) {
6417  copy_bidir<T, N, dim + 1>()(src, dst, ext, base1, idx1, base2, idx2);
6418  src += stride1;
6419  dst += stride2;
6420  }
6421  }
6422 };
6423 
6424 template <typename T, int N>
6425 struct copy_bidir<T, N, N>
6426 {
6427  void operator()(const T* src, T* dst, const extent<N>& ext,
6428  const extent<N>& base1, const index<N>& idx1,
6429  const extent<N>& base2, const index<N>& idx2)
6430  {
6431  src += idx1[N - 1];
6432  dst += idx2[N - 1];
6433  std::copy(src, src + ext[N - 1], dst);
6434  }
6435 };
6436 
6437 template <typename Iter, typename T, int N>
6438 struct do_copy
6439 {
6440  template<template <typename, int> class _amp_container>
6441  void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
6442  size_t size = dest.get_extent().size();
6443  size_t offset = dest.get_offset();
6444  bool modify = true;
6445 
6446  T* ptr = dest.internal().map_ptr(modify, size, offset);
6447  std::copy(srcBegin, srcEnd, ptr);
6448  dest.internal().unmap_ptr(ptr, modify, size, offset);
6449  }
6450  template<template <typename, int> class _amp_container>
6451  void operator()(const _amp_container<T, N> &src, Iter destBegin) {
6452  size_t size = src.get_extent().size();
6453  size_t offset = src.get_offset();
6454  bool modify = false;
6455 
6456  const T* ptr = src.internal().map_ptr(modify, size, offset);
6457  std::copy(ptr, ptr + src.get_extent().size(), destBegin);
6458  src.internal().unmap_ptr(ptr, modify, size, offset);
6459  }
6460 };
6461 
6462 template <typename Iter, typename T>
6463 struct do_copy<Iter, T, 1>
6464 {
6465  template<template <typename, int> class _amp_container>
6466  void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
6467  size_t size = dest.get_extent().size();
6468  size_t offset = dest.get_offset() + dest.get_index_base()[0];
6469  bool modify = true;
6470 
6471  T* ptr = dest.internal().map_ptr(modify, size, offset);
6472  std::copy(srcBegin, srcEnd, ptr);
6473  dest.internal().unmap_ptr(ptr, modify, size, offset);
6474  }
6475  template<template <typename, int> class _amp_container>
6476  void operator()(const _amp_container<T, 1> &src, Iter destBegin) {
6477  size_t size = src.get_extent().size();
6478  size_t offset = src.get_offset() + src.get_index_base()[0];
6479  bool modify = false;
6480 
6481  const T* ptr = src.internal().map_ptr(modify, size, offset);
6482  std::copy(ptr, ptr + src.get_extent().size(), destBegin);
6483  src.internal().unmap_ptr(ptr, modify, size, offset);
6484  }
6485 };
6486 
6487 template <typename T, int N>
6488 struct do_copy<T*, T, N>
6489 {
6490  template<template <typename, int> class _amp_container>
6491  void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
6492  dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
6493  }
6494  template<template <typename, int> class _amp_container>
6495  void operator()(const _amp_container<T, N> &src, T* destBegin) {
6496  src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
6497  }
6498 };
6499 
6500 template <typename T>
6501 struct do_copy<T*, T, 1>
6502 {
6503  template<template <typename, int> class _amp_container>
6504  void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
6505  dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
6506  dest.get_offset() + dest.get_index_base()[0], true);
6507  }
6508  template<template <typename, int> class _amp_container>
6509  void operator()(const _amp_container<T, 1> &src, T* destBegin) {
6510  src.internal().read(destBegin, src.get_extent().size(),
6511  src.get_offset() + src.get_index_base()[0]);
6512  }
6513 };
6514 
6515 // ------------------------------------------------------------------------
6516 // copy
6517 // ------------------------------------------------------------------------
6518 
6527 template <typename T, int N>
6528 void copy(const array<T, N>& src, array<T, N>& dest) {
6529  src.internal().copy(dest.internal(), 0, 0, 0);
6530 }
6531 
6540 template <typename T, int N>
6541 void copy(const array<T, N>& src, const array_view<T, N>& dest) {
6542  if (is_flat(dest))
6543  src.internal().copy(dest.internal(), src.get_offset(),
6544  dest.get_offset(), dest.get_extent().size());
6545  else {
6546  // FIXME: logic here deserves to be reviewed
6547  size_t srcSize = src.extent.size();
6548  size_t srcOffset = 0;
6549  bool srcModify = false;
6550  size_t destSize = dest.extent_base.size();
6551  size_t destOffset = dest.offset;
6552  bool destModify = true;
6553 
6554  T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6555  T* p = pSrc;
6556  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6557  copy_input<T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
6558  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
6559  src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
6560  }
6561 }
6562 
6563 template <typename T>
6564 void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
6565  src.internal().copy(dest.internal(),
6566  src.get_offset() + src.get_index_base()[0],
6567  dest.get_offset() + dest.get_index_base()[0],
6568  dest.get_extent().size());
6569 }
6570 
6582 template <typename T, int N>
6583 void copy(const array_view<const T, N>& src, array<T, N>& dest) {
6584  if (is_flat(src)) {
6585  src.internal().copy(dest.internal(), src.get_offset(),
6586  dest.get_offset(), dest.get_extent().size());
6587  } else {
6588  // FIXME: logic here deserves to be reviewed
6589  size_t srcSize = src.extent_base.size();
6590  size_t srcOffset = src.offset;
6591  bool srcModify = false;
6592  size_t destSize = dest.extent.size();
6593  size_t destOffset = 0;
6594  bool destModify = true;
6595 
6596  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6597  T* p = pDst;
6598  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6599  copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
6600  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
6601  dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
6602  }
6603 }
6604 
6605 template <typename T, int N>
6606 void copy(const array_view<T, N>& src, array<T, N>& dest) {
6607  const array_view<const T, N> buf(src);
6608  copy(buf, dest);
6609 }
6610 
6611 template <typename T>
6612 void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
6613  src.internal().copy(dest.internal(),
6614  src.get_offset() + src.get_index_base()[0],
6615  dest.get_offset() + dest.get_index_base()[0],
6616  dest.get_extent().size());
6617 }
6618 
6630 template <typename T, int N>
6631 void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
6632  if (is_flat(src)) {
6633  if (is_flat(dest))
6634  src.internal().copy(dest.internal(), src.get_offset(),
6635  dest.get_offset(), dest.get_extent().size());
6636  else {
6637  // FIXME: logic here deserves to be reviewed
6638  size_t srcSize = src.extent.size();
6639  size_t srcOffset = 0;
6640  bool srcModify = false;
6641  size_t destSize = dest.extent_base.size();
6642  size_t destOffset = dest.offset;
6643  bool destModify = true;
6644 
6645  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6646  const T* p = pSrc;
6647  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6648  copy_input<const T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
6649  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
6650  src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
6651  }
6652  } else {
6653  if (is_flat(dest)) {
6654  // FIXME: logic here deserves to be reviewed
6655  size_t srcSize = src.extent_base.size();
6656  size_t srcOffset = src.offset;
6657  bool srcModify = false;
6658  size_t destSize = dest.extent.size();
6659  size_t destOffset = 0;
6660  bool destModify = true;
6661 
6662  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6663  T* p = pDst;
6664  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6665  copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
6666  dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
6667  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
6668  } else {
6669  // FIXME: logic here deserves to be reviewed
6670  size_t srcSize = src.extent_base.size();
6671  size_t srcOffset = src.offset;
6672  bool srcModify = false;
6673  size_t destSize = dest.extent_base.size();
6674  size_t destOffset = dest.offset;
6675  bool destModify = true;
6676 
6677  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6678  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6679  copy_bidir<T, N, 1>()(pSrc, pDst, src.extent, src.extent_base,
6680  src.index_base, dest.extent_base, dest.index_base);
6681  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
6682  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
6683  }
6684  }
6685 }
6686 
6687 template <typename T, int N>
6688 void copy(const array_view<T, N>& src, const array_view<T, N>& dest) {
6689  const array_view<const T, N> buf(src);
6690  copy(buf, dest);
6691 }
6692 
6693 template <typename T>
6694 void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
6695  src.internal().copy(dest.internal(),
6696  src.get_offset() + src.get_index_base()[0],
6697  dest.get_offset() + dest.get_index_base()[0],
6698  dest.get_extent().size());
6699 }
6700 
6717 template <typename InputIter, typename T, int N>
6718 void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
6719 #if __KALMAR_ACCELERATOR__ != 1
6720  if ((std::distance(srcBegin, srcEnd) <= 0) || (std::distance(srcBegin, srcEnd) < dest.get_extent().size()))
6721  throw runtime_exception("errorMsg_throw ,copy between different types", 0);
6722 #endif
6723  do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
6724 }
6725 
6726 template <typename InputIter, typename T, int N>
6727 void copy(InputIter srcBegin, array<T, N>& dest) {
6728  InputIter srcEnd = srcBegin;
6729  std::advance(srcEnd, dest.get_extent().size());
6730  copy(srcBegin, srcEnd, dest);
6731 }
6732 
6749 template <typename InputIter, typename T, int N>
6750 void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
6751  if (is_flat(dest))
6752  do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
6753  else {
6754  size_t size = dest.extent_base.size();
6755  size_t offset = dest.offset;
6756  bool modify = true;
6757 
6758  T* ptr = dest.internal().map_ptr(modify, size, offset);
6759  copy_input<InputIter, T, N, 1>()(srcBegin, ptr, dest.extent, dest.extent_base, dest.index_base);
6760  dest.internal().unmap_ptr(ptr, modify, size, offset);
6761  }
6762 }
6763 
6764 template <typename InputIter, typename T, int N>
6765 void copy(InputIter srcBegin, const array_view<T, N>& dest) {
6766  InputIter srcEnd = srcBegin;
6767  std::advance(srcEnd, dest.get_extent().size());
6768  copy(srcBegin, srcEnd, dest);
6769 }
6770 
6783 template <typename OutputIter, typename T, int N>
6784 void copy(const array<T, N> &src, OutputIter destBegin) {
6785  do_copy<OutputIter, T, N>()(src, destBegin);
6786 }
6787 
6798 template <typename OutputIter, typename T, int N>
6799 void copy(const array_view<T, N> &src, OutputIter destBegin) {
6800  if (is_flat(src))
6801  do_copy<OutputIter, T, N>()(src, destBegin);
6802  else {
6803  size_t size = src.extent_base.size();
6804  size_t offset = src.offset;
6805  bool modify = false;
6806 
6807  T* ptr = src.internal().map_ptr(modify, size, offset);
6808  copy_output<OutputIter, T, N, 1>()(ptr, destBegin, src.extent, src.extent_base, src.index_base);
6809  src.internal().unmap_ptr(ptr, modify, size, offset);
6810  }
6811 }
6812 
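// Example (illustrative sketch, not part of the original header): synchronous
// host <-> device copies. "host_in" and "host_out" are placeholder host containers.
//
//   std::vector<float> host_in(256, 1.0f), host_out(256, 0.0f);
//   hc::array<float, 1> dev(256);
//   hc::copy(host_in.begin(), host_in.end(), dev);  // iterator range -> array
//   hc::copy(dev, host_out.begin());                // array -> output iterator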
6813 // ------------------------------------------------------------------------
6814 // utility function for copy_async
6815 // ------------------------------------------------------------------------
6816 
6817 
6818 // ------------------------------------------------------------------------
6819 // copy_async
6820 // ------------------------------------------------------------------------
6821 
6830 template <typename T, int N>
6831 completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
6832  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6833  return completion_future(fut.share());
6834 }
6835 
6843 template <typename T, int N>
6844 completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
6845  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6846  return completion_future(fut.share());
6847 }
6848 
6858 template <typename T, int N>
6859 completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
6860  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6861  return completion_future(fut.share());
6862 }
6863 
6864 template <typename T, int N>
6865 completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
6866  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6867  return completion_future(fut.share());
6868 }
6869 
6881 template <typename T, int N>
6882 completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
6883  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6884  return completion_future(fut.share());
6885 }
6886 
6887 template <typename T, int N>
6888 completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
6889  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6890  return completion_future(fut.share());
6891 }
6892 
6909 template <typename InputIter, typename T, int N>
6910 completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
6911  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
6912  return completion_future(fut.share());
6913 }
6914 
6915 template <typename InputIter, typename T, int N>
6916 completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
6917  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
6918  return completion_future(fut.share());
6919 }
6920 
6937 template <typename InputIter, typename T, int N>
6938 completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
6939  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
6940  return completion_future(fut.share());
6941 }
6942 
6943 template <typename InputIter, typename T, int N>
6944 completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
6945  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
6946  return completion_future(fut.share());
6947 }
6948 
6961 template <typename OutputIter, typename T, int N>
6962 completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
6963  std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
6964  return completion_future(fut.share());
6965 }
6966 
6977 template <typename OutputIter, typename T, int N>
6978 completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
6979  std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
6980  return completion_future(fut.share());
6981 }
6982 
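// Example (illustrative sketch, not part of the original header): overlapping a
// host-to-device copy with other host work via copy_async. "host" is a
// placeholder host container.
//
//   std::vector<int> host(1024, 42);
//   hc::array<int, 1> dev(1024);
//   hc::completion_future cf = hc::copy_async(host.begin(), host.end(), dev);
//   // ... unrelated host work can proceed here ...
//   cf.wait();  // block until the asynchronous copy has completed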
6983 
6984 // FIXME: consider remove these functions
6985 template <typename T, int N>
6986 completion_future copy_async(const array<T, N>& src, const array<T, N>& dest) {
6987  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6988  return completion_future(fut.share());
6989 }
6990 
6991 template <typename T, int N>
6992 completion_future copy_async(const array_view<const T, N>& src, const array<T, N>& dest) {
6993  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6994  return completion_future(fut.share());
6995 }
6996 
6997 template <typename T, int N>
6998 completion_future copy_async(const array_view<T, N>& src, const array<T, N>& dest) {
6999  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
7000  return completion_future(fut.share());
7001 }
7002 
7003 // ------------------------------------------------------------------------
7004 // atomic functions
7005 // ------------------------------------------------------------------------
7006 
7022 #if __KALMAR_ACCELERATOR__ == 1
7023 extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) __HC__;
7024 extern "C" int atomic_exchange_int(int *p, int val) __HC__;
7025 extern "C" float atomic_exchange_float(float *p, float val) __HC__;
7026 extern "C" uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val) __HC__;
7027 
7028 static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) __CPU__ __HC__ {
7029  return atomic_exchange_unsigned(dest, val);
7030 }
7031 static inline int atomic_exchange(int * dest, int val) __CPU__ __HC__ {
7032  return atomic_exchange_int(dest, val);
7033 }
7034 static inline float atomic_exchange(float * dest, float val) __CPU__ __HC__ {
7035  return atomic_exchange_float(dest, val);
7036 }
7037 static inline uint64_t atomic_exchange(uint64_t * dest, uint64_t val) __CPU__ __HC__ {
7038  return atomic_exchange_uint64(dest, val);
7039 }
7040 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7041 unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
7042 int atomic_exchange_int(int *p, int val);
7043 float atomic_exchange_float(float *p, float val);
7044 uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val);
7045 
7046 static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ {
7047  return atomic_exchange_unsigned(dest, val);
7048 }
7049 static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ {
7050  return atomic_exchange_int(dest, val);
7051 }
7052 static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ {
7053  return atomic_exchange_float(dest, val);
7054 }
7055 static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ {
7056  return atomic_exchange_uint64(dest, val);
7057 }
7058 #else
7059 extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__;
7060 extern int atomic_exchange(int *dest, int val) __CPU__ __HC__;
7061 extern float atomic_exchange(float *dest, float val) __CPU__ __HC__;
7062 extern uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__;
7063 #endif
7064 
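// Example (illustrative sketch, not part of the original header): claiming slots
// with atomic_exchange inside a kernel. "flag_storage" is a placeholder host
// vector of 64 zero-initialized unsigned ints.
//
//   hc::array_view<unsigned int, 1> flags(64, flag_storage);
//   hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1> i) [[hc]] {
//       unsigned int old = hc::atomic_exchange(&flags[i], 1u);
//       // old == 0 means this work-item was the first to claim slot i
//   });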
7097 #if __KALMAR_ACCELERATOR__ == 1
7098 extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) __HC__;
7099 extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) __HC__;
7100 extern "C" uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val) __HC__;
7101 
7102 static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
7103  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
7104  return (*dest == val);
7105 }
7106 static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
7107  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
7108  return (*dest == val);
7109 }
7110 static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
7111  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
7112  return (*dest == val);
7113 }
7114 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7115 unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
7116 int atomic_compare_exchange_int(int *dest, int expected_val, int val);
7117 uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val);
7118 
7119 static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
7120  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
7121  return (*dest == val);
7122 }
7123 static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
7124  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
7125  return (*dest == val);
7126 }
7127 static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
7128  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
7129  return (*dest == val);
7130 }
7131 #else
7132 extern bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__;
7133 extern bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__;
7134 extern bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__;
7135 #endif
7136 
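// Example (illustrative sketch, not part of the original header): a retry loop
// built on atomic_compare_exchange, which writes the observed value back into
// *expected_val and reports whether the exchange took effect. "counter" is a
// placeholder pointer to a shared unsigned int.
//
//   unsigned int expected = *counter;
//   while (!hc::atomic_compare_exchange(counter, &expected, expected + 1u)) {
//       // expected now holds the latest observed value; retry with it
//   }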
7166 #if __KALMAR_ACCELERATOR__ == 1
7167 extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) __HC__;
7168 extern "C" int atomic_add_int(int *p, int val) __HC__;
7169 extern "C" float atomic_add_float(float *p, float val) __HC__;
7170 extern "C" uint64_t atomic_add_uint64(uint64_t *p, uint64_t val) __HC__;
7171 
7172 static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7173  return atomic_add_unsigned(x, y);
7174 }
7175 static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
7176  return atomic_add_int(x, y);
7177 }
7178 static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
7179  return atomic_add_float(x, y);
7180 }
7181 static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7182  return atomic_add_uint64(x, y);
7183 }
7184 
7185 extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) __HC__;
7186 extern "C" int atomic_sub_int(int *p, int val) __HC__;
7187 extern "C" float atomic_sub_float(float *p, float val) __HC__;
7188 
7189 static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7190  return atomic_sub_unsigned(x, y);
7191 }
7192 static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
7193  return atomic_sub_int(x, y);
7194 }
7195 static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
7196  return atomic_sub_float(x, y);
7197 }
7198 
7199 extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) __HC__;
7200 extern "C" int atomic_and_int(int *p, int val) __HC__;
7201 extern "C" uint64_t atomic_and_uint64(uint64_t *p, uint64_t val) __HC__;
7202 
7203 static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7204  return atomic_and_unsigned(x, y);
7205 }
7206 static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
7207  return atomic_and_int(x, y);
7208 }
7209 static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7210  return atomic_and_uint64(x, y);
7211 }
7212 
7213 extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) __HC__;
7214 extern "C" int atomic_or_int(int *p, int val) __HC__;
7215 extern "C" uint64_t atomic_or_uint64(uint64_t *p, uint64_t val) __HC__;
7216 
7217 static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7218  return atomic_or_unsigned(x, y);
7219 }
7220 static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
7221  return atomic_or_int(x, y);
7222 }
7223 static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7224  return atomic_or_uint64(x, y);
7225 }
7226 
7227 extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) __HC__;
7228 extern "C" int atomic_xor_int(int *p, int val) __HC__;
7229 extern "C" uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val) __HC__;
7230 
7231 static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7232  return atomic_xor_unsigned(x, y);
7233 }
7234 static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
7235  return atomic_xor_int(x, y);
7236 }
7237 static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7238  return atomic_xor_uint64(x, y);
7239 }
7240 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7241 unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
7242 int atomic_add_int(int *p, int val);
7243 float atomic_add_float(float *p, float val);
7244 uint64_t atomic_add_uint64(uint64_t *p, uint64_t val);
7245 
7246 static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7247  return atomic_add_unsigned(x, y);
7248 }
7249 static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
7250  return atomic_add_int(x, y);
7251 }
7252 static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
7253  return atomic_add_float(x, y);
7254 }
7255 static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7256  return atomic_add_uint64(x, y);
7257 }
7258 
7259 unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
7260 int atomic_sub_int(int *p, int val);
7261 float atomic_sub_float(float *p, float val);
7262 
7263 static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7264  return atomic_sub_unsigned(x, y);
7265 }
7266 static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
7267  return atomic_sub_int(x, y);
7268 }
7269 static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
7270  return atomic_sub_float(x, y);
7271 }
7272 
7273 unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
7274 int atomic_and_int(int *p, int val);
7275 uint64_t atomic_and_uint64(uint64_t *p, uint64_t val);
7276 
7277 static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7278  return atomic_and_unsigned(x, y);
7279 }
7280 static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
7281  return atomic_and_int(x, y);
7282 }
7283 static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7284  return atomic_and_uint64(x, y);
7285 }
7286 
7287 unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
7288 int atomic_or_int(int *p, int val);
7289 uint64_t atomic_or_uint64(uint64_t *p, uint64_t val);
7290 
7291 static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7292  return atomic_or_unsigned(x, y);
7293 }
7294 static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
7295  return atomic_or_int(x, y);
7296 }
7297 static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7298  return atomic_or_uint64(x, y);
7299 }
7300 
7301 unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
7302 int atomic_xor_int(int *p, int val);
7303 uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val);
7304 
7305 static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7306  return atomic_xor_unsigned(x, y);
7307 }
7308 static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
7309  return atomic_xor_int(x, y);
7310 }
7311 static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7312  return atomic_xor_uint64(x, y);
7313 }
7314 #else
7315 extern unsigned atomic_fetch_add(unsigned *x, unsigned y) __CPU__ __HC__;
7316 extern int atomic_fetch_add(int *x, int y) __CPU__ __HC__;
7317 extern float atomic_fetch_add(float *x, float y) __CPU__ __HC__;
7318 extern uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__;
7319 
7320 extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) __CPU__ __HC__;
7321 extern int atomic_fetch_sub(int *x, int y) __CPU__ __HC__;
7322 extern float atomic_fetch_sub(float *x, float y) __CPU__ __HC__;
7323 
7324 extern unsigned atomic_fetch_and(unsigned *x, unsigned y) __CPU__ __HC__;
7325 extern int atomic_fetch_and(int *x, int y) __CPU__ __HC__;
7326 extern uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__;
7327 
7328 extern unsigned atomic_fetch_or(unsigned *x, unsigned y) __CPU__ __HC__;
7329 extern int atomic_fetch_or(int *x, int y) __CPU__ __HC__;
7330 extern uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__;
7331 
7332 extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) __CPU__ __HC__;
7333 extern int atomic_fetch_xor(int *x, int y) __CPU__ __HC__;
7334 extern uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__;
7335 #endif
7336 
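// Example (illustrative sketch, not part of the original header): counting
// matching elements with atomic_fetch_add. "data" and "count_storage" are
// placeholder host containers; count_storage holds a single zero-initialized
// unsigned int and n is the input length.
//
//   hc::array_view<const int, 1> in(n, data);
//   hc::array_view<unsigned int, 1> count(1, count_storage);
//   hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//       if (in[i] > 0)
//           hc::atomic_fetch_add(&count[0], 1u);
//   });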
7337 #if __KALMAR_ACCELERATOR__ == 1
7338 extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) __HC__;
7339 extern "C" int atomic_max_int(int *p, int val) __HC__;
7340 extern "C" uint64_t atomic_max_uint64(uint64_t *p, uint64_t val) __HC__;
7341 
7342 static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
7343  return atomic_max_unsigned(x, y);
7344 }
7345 static inline int atomic_fetch_max(int *x, int y) __HC__ {
7346  return atomic_max_int(x, y);
7347 }
7348 static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
7349  return atomic_max_uint64(x, y);
7350 }
7351 
7352 extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) __HC__;
7353 extern "C" int atomic_min_int(int *p, int val) __HC__;
7354 extern "C" uint64_t atomic_min_uint64(uint64_t *p, uint64_t val) __HC__;
7355 
7356 static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
7357  return atomic_min_unsigned(x, y);
7358 }
7359 static inline int atomic_fetch_min(int *x, int y) __HC__ {
7360  return atomic_min_int(x, y);
7361 }
7362 static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
7363  return atomic_min_uint64(x, y);
7364 }
7365 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7366 unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
7367 int atomic_max_int(int *p, int val);
7368 uint64_t atomic_max_uint64(uint64_t *p, uint64_t val);
7369 
7370 static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
7371  return atomic_max_unsigned(x, y);
7372 }
7373 static inline int atomic_fetch_max(int *x, int y) __HC__ {
7374  return atomic_max_int(x, y);
7375 }
7376 static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
7377  return atomic_max_uint64(x, y);
7378 }
7379 
7380 unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
7381 int atomic_min_int(int *p, int val);
7382 uint64_t atomic_min_uint64(uint64_t *p, uint64_t val);
7383 
7384 static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
7385  return atomic_min_unsigned(x, y);
7386 }
7387 static inline int atomic_fetch_min(int *x, int y) __HC__ {
7388  return atomic_min_int(x, y);
7389 }
7390 static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
7391  return atomic_min_uint64(x, y);
7392 }
7393 #else
7394 extern int atomic_fetch_max(int * dest, int val) __CPU__ __HC__;
7395 extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) __CPU__ __HC__;
7396 extern uint64_t atomic_fetch_max(uint64_t * dest, uint64_t val) __CPU__ __HC__;
7397 
7398 extern int atomic_fetch_min(int * dest, int val) __CPU__ __HC__;
7399 extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) __CPU__ __HC__;
7400 extern uint64_t atomic_fetch_min(uint64_t * dest, uint64_t val) __CPU__ __HC__;
7401 #endif
7402 
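// Example (illustrative sketch, not part of the original header): reducing to a
// global maximum with atomic_fetch_max. "best_storage" is a placeholder host
// container with one element initialized to INT_MIN, and "in" is a read-only
// view of the n input values.
//
//   hc::array_view<int, 1> best(1, best_storage);
//   hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//       hc::atomic_fetch_max(&best[0], in[i]);
//   });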
7418 #if __KALMAR_ACCELERATOR__ == 1
7419 extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) __HC__;
7420 extern "C" int atomic_inc_int(int *p) __HC__;
7421 
7422 static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
7423  return atomic_inc_unsigned(x);
7424 }
7425 static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
7426  return atomic_inc_int(x);
7427 }
7428 
7429 extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) __HC__;
7430 extern "C" int atomic_dec_int(int *p) __HC__;
7431 
7432 static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
7433  return atomic_dec_unsigned(x);
7434 }
7435 static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
7436  return atomic_dec_int(x);
7437 }
7438 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7439 unsigned int atomic_inc_unsigned(unsigned int *p);
7440 int atomic_inc_int(int *p);
7441 
7442 static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
7443  return atomic_inc_unsigned(x);
7444 }
7445 static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
7446  return atomic_inc_int(x);
7447 }
7448 
7449 unsigned int atomic_dec_unsigned(unsigned int *p);
7450 int atomic_dec_int(int *p);
7451 
7452 static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
7453  return atomic_dec_unsigned(x);
7454 }
7455 static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
7456  return atomic_dec_int(x);
7457 }
7458 #else
7459 extern int atomic_fetch_inc(int * _Dest) __CPU__ __HC__;
7460 extern unsigned int atomic_fetch_inc(unsigned int * _Dest) __CPU__ __HC__;
7461 
7462 extern int atomic_fetch_dec(int * _Dest) __CPU__ __HC__;
7463 extern unsigned int atomic_fetch_dec(unsigned int * _Dest) __CPU__ __HC__;
7464 #endif
7465 
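// Example (illustrative sketch, not part of the original header): handing out
// output slots with atomic_fetch_inc, which returns the pre-increment value.
// "in", "out", and "cursor" are placeholder array_views; cursor holds one
// unsigned int initialized to zero.
//
//   hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//       if (in[i] != 0) {
//           unsigned int slot = hc::atomic_fetch_inc(&cursor[0]);
//           out[slot] = in[i];
//       }
//   });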
7478 extern "C" unsigned int __atomic_wrapinc(unsigned int* address, unsigned int val) __HC__;
7479 
7490 extern "C" unsigned int __atomic_wrapdec(unsigned int* address, unsigned int val) __HC__;
7491 
7492 
7493 // ------------------------------------------------------------------------
7494 // parallel_for_each
7495 // ------------------------------------------------------------------------
7496 
7497 template <int N, typename Kernel>
7498 completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
7499 
7500 template <typename Kernel>
7501 completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
7502 
7503 template <typename Kernel>
7504 completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
7505 
7506 template <typename Kernel>
7507 completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
7508 
7509 template <int N, typename Kernel>
7510 completion_future parallel_for_each(const extent<N>& compute_domain, const Kernel& f) {
7511  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7512 }
7513 
7514 template <typename Kernel>
7515 completion_future parallel_for_each(const tiled_extent<3>& compute_domain, const Kernel& f) {
7516  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7517 }
7518 
7519 template <typename Kernel>
7520 completion_future parallel_for_each(const tiled_extent<2>& compute_domain, const Kernel& f) {
7521  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7522 }
7523 
7524 template <typename Kernel>
7525 completion_future parallel_for_each(const tiled_extent<1>& compute_domain, const Kernel& f) {
7526  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7527 }
7528 
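// Example (illustrative sketch, not part of the original header): launching a
// SAXPY kernel with the extent-only overload, which dispatches to the
// auto-selected accelerator_view. "xs" and "ys" are placeholder host vectors of
// length n.
//
//   hc::array_view<float, 1> x(n, xs), y(n, ys);
//   float a = 2.0f;
//   hc::completion_future cf =
//       hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//           y[i] = a * x[i] + y[i];
//       });
//   cf.wait();  // y.synchronize() (or y's destruction) writes results back to ys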
7529 template <int N, typename Kernel, typename _Tp>
7530 struct pfe_helper
7531 {
7532  static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
7533  int i;
7534  for (i = 0; i < k.ext[N - 1]; ++i) {
7535  idx[N - 1] = i;
7536  pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
7537  }
7538  }
7539 };
7540 template <typename Kernel, typename _Tp>
7541 struct pfe_helper<0, Kernel, _Tp>
7542 {
7543  static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
7544 #if __KALMAR_ACCELERATOR__ == 1
7545  k.k(idx);
7546 #endif
7547  }
7548 };
7549 
7550 template <int N, typename Kernel>
7551 class pfe_wrapper
7552 {
7553 public:
7554  explicit pfe_wrapper(const extent<N>& other, const Kernel& f) __CPU__ __HC__
7555  : ext(other), k(f) {}
7556  void operator() (index<N> idx) __CPU__ __HC__ {
7557  pfe_helper<N - 3, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
7558  }
7559 private:
7560  const extent<N> ext;
7561  const Kernel k;
7562  template <int K, typename Ker, typename _Tp>
7563  friend struct pfe_helper;
7564 };
7565 
7566 #pragma clang diagnostic push
7567 #pragma clang diagnostic ignored "-Wreturn-type"
7568 #pragma clang diagnostic ignored "-Wunused-variable"
7569 //ND parallel_for_each, nontiled
7570 template <int N, typename Kernel>
7571 __attribute__((noinline,used)) completion_future parallel_for_each(
7572  const accelerator_view& av,
7573  const extent<N>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7574 #if __KALMAR_ACCELERATOR__ != 1
7575  for(int i = 0 ; i < N ; i++)
7576  {
7577  // silently return in case any dimension of the extent is 0
7578  if (compute_domain[i] == 0)
7579  return completion_future();
7580  if (compute_domain[i] < 0)
7581  throw invalid_compute_domain("Extent is less than 0.");
7582  if (static_cast<size_t>(compute_domain[i]) > 4294967295L)
7583  throw invalid_compute_domain("Extent size too large.");
7584  }
7585  size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
7586  static_cast<size_t>(compute_domain[N - 2]),
7587  static_cast<size_t>(compute_domain[N - 3])};
7588 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7589  if (is_cpu()) {
7590  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7591  }
7592 #endif
7593  if (av.get_accelerator().get_device_path() == L"cpu") {
7594  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7595  }
7596  const pfe_wrapper<N, Kernel> _pf(compute_domain, f);
7597  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<pfe_wrapper<N, Kernel>, 3>(av.pQueue, ext, NULL, _pf));
7598 #else
7599 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7600  int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
7601 #endif
7602  auto bar = &pfe_wrapper<N, Kernel>::operator();
7604  int* foo = reinterpret_cast<int*>(&pfe_wrapper<N, Kernel>::__cxxamp_trampoline);
7605 #endif
7606 }
7607 #pragma clang diagnostic pop
7608 
7609 #pragma clang diagnostic push
7610 #pragma clang diagnostic ignored "-Wreturn-type"
7611 #pragma clang diagnostic ignored "-Wunused-variable"
7612 //1D parallel_for_each, nontiled
7613 template <typename Kernel>
7614 __attribute__((noinline,used)) completion_future parallel_for_each(
7615  const accelerator_view& av, const extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7616 #if __KALMAR_ACCELERATOR__ != 1
7617  // silently return in case any dimension of the extent is 0
7618  if (compute_domain[0] == 0)
7619  return completion_future();
7620  if (compute_domain[0] < 0) {
7621  throw invalid_compute_domain("Extent is less than 0.");
7622  }
7623  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7624  throw invalid_compute_domain("Extent size too large.");
7625 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7626  if (is_cpu()) {
7627  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7628  }
7629 #endif
7630  size_t ext = compute_domain[0];
7631  if (av.get_accelerator().get_device_path() == L"cpu") {
7632  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7633  }
7634  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 1>(av.pQueue, &ext, NULL, f));
7635 #else //if __KALMAR_ACCELERATOR__ != 1
7636  //to ensure functor has right operator() defined
7637  //this triggers the trampoline code being emitted
7638  auto foo = &Kernel::__cxxamp_trampoline;
7639  auto bar = &Kernel::operator();
7640 #endif
7641 }
7642 #pragma clang diagnostic pop
7643 
7644 #pragma clang diagnostic push
7645 #pragma clang diagnostic ignored "-Wreturn-type"
7646 #pragma clang diagnostic ignored "-Wunused-variable"
7647 //2D parallel_for_each, nontiled
7648 template <typename Kernel>
7649 __attribute__((noinline,used)) completion_future parallel_for_each(
7650  const accelerator_view& av, const extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7651 #if __KALMAR_ACCELERATOR__ != 1
7652  // silently return in case any dimension of the extent is 0
7653  if (compute_domain[0] == 0 || compute_domain[1] == 0)
7654  return completion_future();
7655  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
7656  throw invalid_compute_domain("Extent is less than 0.");
7657  }
7658  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7659  throw invalid_compute_domain("Extent size too large.");
7660  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7661  throw invalid_compute_domain("Extent size too large.");
7662 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7663  if (is_cpu()) {
7664  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7665  }
7666 #endif
7667  size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
7668  static_cast<size_t>(compute_domain[0])};
7669  if (av.get_accelerator().get_device_path() == L"cpu") {
7670  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7671  }
7672  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 2>(av.pQueue, ext, NULL, f));
7673 #else //if __KALMAR_ACCELERATOR__ != 1
7674  //to ensure functor has right operator() defined
7675  //this triggers the trampoline code being emitted
7676  auto foo = &Kernel::__cxxamp_trampoline;
7677  auto bar = &Kernel::operator();
7678 #endif
7679 }
7680 #pragma clang diagnostic pop
7681 
7682 #pragma clang diagnostic push
7683 #pragma clang diagnostic ignored "-Wreturn-type"
7684 #pragma clang diagnostic ignored "-Wunused-variable"
7685 //3D parallel_for_each, nontiled
7686 template <typename Kernel>
7687 __attribute__((noinline,used)) completion_future parallel_for_each(
7688  const accelerator_view& av, const extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7689 #if __KALMAR_ACCELERATOR__ != 1
7690  // silently return in case any dimension of the extent is 0
7691  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
7692  return completion_future();
7693  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
7694  throw invalid_compute_domain("Extent is less than 0.");
7695  }
7696  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7697  throw invalid_compute_domain("Extent size too large.");
7698  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7699  throw invalid_compute_domain("Extent size too large.");
7700  if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
7701  throw invalid_compute_domain("Extent size too large.");
7702 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7703  if (is_cpu()) {
7704  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7705  }
7706 #endif
7707  size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
7708  static_cast<size_t>(compute_domain[1]),
7709  static_cast<size_t>(compute_domain[0])};
7710  if (av.get_accelerator().get_device_path() == L"cpu") {
7711  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7712  }
7713  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 3>(av.pQueue, ext, NULL, f));
7714 #else //if __KALMAR_ACCELERATOR__ != 1
7715  //to ensure functor has right operator() defined
7716  //this triggers the trampoline code being emitted
7717  auto foo = &Kernel::__cxxamp_trampoline;
7718  auto bar = &Kernel::operator();
7719 #endif
7720 }
7721 #pragma clang diagnostic pop
7722 
7723 #pragma clang diagnostic push
7724 #pragma clang diagnostic ignored "-Wreturn-type"
7725 #pragma clang diagnostic ignored "-Wunused-variable"
7726 //1D parallel_for_each, tiled
7727 template <typename Kernel>
7728 __attribute__((noinline,used)) completion_future parallel_for_each(
7729  const accelerator_view& av, const tiled_extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7730 #if __KALMAR_ACCELERATOR__ != 1
7731  // silently return in case any dimension of the extent is 0
7732  if (compute_domain[0] == 0)
7733  return completion_future();
7734  if (compute_domain[0] < 0) {
7735  throw invalid_compute_domain("Extent is less than 0.");
7736  }
7737  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7738  throw invalid_compute_domain("Extent size too large.");
7739  size_t ext = compute_domain[0];
7740  size_t tile = compute_domain.tile_dim[0];
7741 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7742  if (is_cpu()) {
7743  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7744  } else
7745 #endif
7746  if (av.get_accelerator().get_device_path() == L"cpu") {
7747  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7748  }
7749  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
7750  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 1>(av.pQueue, &ext, &tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
7751 #else //if __KALMAR_ACCELERATOR__ != 1
7752  tiled_index<1> this_is_used_to_instantiate_the_right_index;
7753  //to ensure functor has right operator() defined
7754  //this triggers the trampoline code being emitted
7755  auto foo = &Kernel::__cxxamp_trampoline;
7756  auto bar = &Kernel::operator();
7757 #endif
7758 }
7759 #pragma clang diagnostic pop
7760 
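A hedged sketch of the tiled 1-D overload, assuming extent<1>::tile() with a tile size of 256 and using tile_static memory plus a tile barrier (names are illustrative, not from hc.hpp):

#include <hc.hpp>
#include <vector>

void reverse_within_tiles() {
    constexpr int N = 4096, TILE = 256;
    hc::accelerator_view av = hc::accelerator().get_default_view();
    std::vector<float> host(N, 1.0f);
    hc::array_view<float, 1> data(N, host);
    hc::tiled_extent<1> t_ext = hc::extent<1>(N).tile(TILE);
    hc::parallel_for_each(
        av, t_ext,
        [=](hc::tiled_index<1> tidx) [[hc]] {
            tile_static float lds[TILE];                        // per-tile scratch storage
            lds[tidx.local[0]] = data[tidx.global];
            tidx.barrier.wait();                                // every lane in the tile reaches this point
            data[tidx.global] = lds[TILE - 1 - tidx.local[0]];  // reverse the tile as a toy example
        });
    data.synchronize();
}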
7761 #pragma clang diagnostic push
7762 #pragma clang diagnostic ignored "-Wreturn-type"
7763 #pragma clang diagnostic ignored "-Wunused-variable"
7764 //2D parallel_for_each, tiled
7765 template <typename Kernel>
7766 __attribute__((noinline,used)) completion_future parallel_for_each(
7767  const accelerator_view& av, const tiled_extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7768 #if __KALMAR_ACCELERATOR__ != 1
7769  // silently return in case any dimension of the extent is 0
7770  if (compute_domain[0] == 0 || compute_domain[1] == 0)
7771  return completion_future();
7772  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
7773  throw invalid_compute_domain("Extent is less than 0.");
7774  }
7775  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7776  throw invalid_compute_domain("Extent size too large.");
7777  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7778  throw invalid_compute_domain("Extent size too large.");
7779  size_t ext[2] = { static_cast<size_t>(compute_domain[1]),
7780  static_cast<size_t>(compute_domain[0])};
7781  size_t tile[2] = { static_cast<size_t>(compute_domain.tile_dim[1]),
7782  static_cast<size_t>(compute_domain.tile_dim[0]) };
7783 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7784  if (is_cpu()) {
7785  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7786  } else
7787 #endif
7788  if (av.get_accelerator().get_device_path() == L"cpu") {
7789  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7790  }
7791  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
7792  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 2>(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
7793 #else //if __KALMAR_ACCELERATOR__ != 1
7794  tiled_index<2> this_is_used_to_instantiate_the_right_index;
7795  //to ensure functor has right operator() defined
7796  //this triggers the trampoline code being emitted
7797  auto foo = &Kernel::__cxxamp_trampoline;
7798  auto bar = &Kernel::operator();
7799 #endif
7800 }
7801 #pragma clang diagnostic pop
7802 
7803 #pragma clang diagnostic push
7804 #pragma clang diagnostic ignored "-Wreturn-type"
7805 #pragma clang diagnostic ignored "-Wunused-variable"
7806 //3D parallel_for_each, tiled
7807 template <typename Kernel>
7808 __attribute__((noinline,used)) completion_future parallel_for_each(
7809  const accelerator_view& av, const tiled_extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7810 #if __KALMAR_ACCELERATOR__ != 1
7811  // silently return in case any dimension of the extent is 0
7812  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
7813  return completion_future();
7814  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
7815  throw invalid_compute_domain("Extent is less than 0.");
7816  }
7817  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7818  throw invalid_compute_domain("Extent size too large.");
7819  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7820  throw invalid_compute_domain("Extent size too large.");
7821  if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
7822  throw invalid_compute_domain("Extent size too large.");
7823  size_t ext[3] = { static_cast<size_t>(compute_domain[2]),
7824  static_cast<size_t>(compute_domain[1]),
7825  static_cast<size_t>(compute_domain[0])};
7826  size_t tile[3] = { static_cast<size_t>(compute_domain.tile_dim[2]),
7827  static_cast<size_t>(compute_domain.tile_dim[1]),
7828  static_cast<size_t>(compute_domain.tile_dim[0]) };
7829 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7830  if (is_cpu()) {
7831  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7832  } else
7833 #endif
7834  if (av.get_accelerator().get_device_path() == L"cpu") {
7835  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7836  }
7837  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
7838  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 3>(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
7839 #else //if __KALMAR_ACCELERATOR__ != 1
7840  tiled_index<3> this_is_used_to_instantiate_the_right_index;
7841  //to ensure functor has right operator() defined
7842  //this triggers the trampoline code being emitted
7843  auto foo = &Kernel::__cxxamp_trampoline;
7844  auto bar = &Kernel::operator();
7845 #endif
7846 }
7847 #pragma clang diagnostic pop
7848 
7849 } // namespace hc
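Note that every overload above throws runtime_exception when the target device path is L"cpu". A hedged sketch of selecting a non-CPU accelerator_view before dispatching (get_device_path() and get_default_view() appear in the reference entries below; hc::accelerator::get_all() is assumed from the accelerator API, and the helper name is made up):

#include <hc.hpp>
#include <vector>

hc::accelerator_view pick_gpu_view() {
    // Prefer any accelerator whose device path is not the host ("cpu") path.
    for (const hc::accelerator& acc : hc::accelerator::get_all()) {
        if (acc.get_device_path() != L"cpu")
            return acc.get_default_view();
    }
    // Fall back to the default accelerator's default view.
    return hc::accelerator().get_default_view();
}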
unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
The partial specialization array_view<const T,N> represents a view over elements of type const T with...
Definition: hc.hpp:5762
array_view(const extent< N > &ext)
Constructs an array_view which is not bound to a data source.
Definition: hc.hpp:5211
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator_view supports debugging through extensive ...
Definition: hc.hpp:197
This class is the return type of all asynchronous APIs and has an interface analogous to std::shared_...
Definition: hc.hpp:1130
Definition: hc.hpp:4064
array(int e0, int e1, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4387
uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
void * get_hsa_agent()
Returns an opaque handle which points to the underlying HSA agent.
Definition: hc.hpp:478
void flush()
Sends the queued up commands in the accelerator_view to the device for execution. ...
Definition: hc.hpp:241
array_view(const extent< N > &ext, value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5194
extent(const extent &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:1604
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:6205
int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
const T value_type
The element type of this array.
Definition: hc.hpp:5781
int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
tiled_extent(const extent< 1 > &ext, int t0) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2087
array_view< const ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:6249
unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do linear interpolation and computes the unsigned 8-bit average of packed data.
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6095
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4649
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2200
const index< 2 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3689
float __amdgcn_wave_sl1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2943
array & operator=(array &&other)
Moves the contents of the array "other" to this array.
Definition: hc.hpp:4705
array(int e0, int e1)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4276
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4643
bool get_is_auto_selection()
Returns a boolean value indicating whether the accelerator view when passed to a parallel_for_each wo...
Definition: hc.hpp:171
void * get_hsa_am_finegrained_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:515
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5225
tiled_extent(const extent< 2 > &ext, int t0, int t1, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2192
const index< 3 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3468
bool operator==(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>.
Definition: hc.hpp:1738
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3456
std::vector< accelerator > get_peers() const
Return a std::vector of this accelerator's peers.
Definition: hc.hpp:1070
array(const array_view< const T, N > &src, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array initialized with the contents of the array_view "src".
Definition: hc.hpp:4500
Represents an extent subdivided into tiles.
Definition: hc.hpp:2218
uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void * get_hsa_am_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:503
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5299
array(const extent< N > &ext, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4374
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2309
uint64_t get_begin_tick()
Get the tick number when the underlying asynchronous operation begins.
Definition: hc.hpp:1316
size_t get_dedicated_memory() const
Returns the amount of dedicated memory (in KB) on an accelerator device.
Definition: hc.hpp:907
tiled_extent(const tiled_extent &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2018
float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with lower ID relative to caller within a wavefront.
Definition: hc.hpp:3062
array_view< const T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:6187
float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:5425
unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2141
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2264
int __unpacklo_s16x2(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
bool operator!=(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent differe...
Definition: hc.hpp:847
array & operator=(const array &other)
Assigns the contents of the array "other" to this array, using a deep copy.
Definition: hc.hpp:4690
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchronous operation.
Definition: hc.hpp:103
accelerator_view get_associated_accelerator_view() const
This property returns the accelerator_view representing the preferred target where this array can be ...
Definition: hc.hpp:4675
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: hc.hpp:5908
unsigned int __lastbit_u32_u32(unsigned int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2535
float __amdgcn_wave_rl1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2982
void wait() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3330
The tile_barrier class is a capability class that is only creatable by the system, and passed to a tiled parallel_for_each function object as part of the tiled_index parameter.
Definition: hc.hpp:3294
index< N > operator/(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:559
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5649
accelerator()
Constructs a new accelerator object that represents the default accelerator.
Definition: hc.hpp:713
array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4572
The array_view<T,N> type represents a possibly cached view into the data held in an array<T...
Definition: hc.hpp:60
unsigned int __firstbit_u32_u32(unsigned int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2477
extent & operator*=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1763
int tile_dim[3]
Tile size for each dimension.
Definition: hc.hpp:2231
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3655
array(const extent< N > &ext, InputIter srcBegin)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: hc.hpp:4298
void * get_hsa_agent() const
Returns an opaque handle which points to the underlying HSA agent.
Definition: hc.hpp:1051
uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4523
bool operator!=(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>.
Definition: hc.hpp:1741
unsigned int __bitrev_b32(unsigned int src0)[[hc]] __asm("llvm.bitreverse.i32")
Reverse the bits.
void global_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for global (but not tile-static) memory operations...
int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4997
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4326
void * get_hsa_am_finegrained_system_region() const
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:1001
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view.
Definition: hc.hpp:5367
const index< 1 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3581
Represents an extent subdivided into tiles.
Definition: hc.hpp:2031
bool get_is_empty()
Returns true if the accelerator_view is currently empty.
Definition: hc.hpp:458
tiled_extent(const tiled_extent< 2 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2173
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6056
array_view & operator=(const array_view< T, N > &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5919
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array to the array_view given by "dest", as if by calling "copy(*this...
Definition: hc.hpp:4753
extent & operator*=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1824
const index< 2 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3679
extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:1594
array_view(const extent< N > &extent, Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5180
Represents a unique position in N-dimensional space.
Definition: hc.hpp:58
uint64_t atomic_fetch_min(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void * get_hsa_kernarg_region()
Returns an opaque handle which points to the Kernarg region on the HSA agent.
Definition: hc.hpp:526
extent & operator+=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1755
accelerator & operator=(const accelerator &other)
Assigns an accelerator object to "this" accelerator object and returns a reference to "this" object...
Definition: hc.hpp:802
Definition: hc.hpp:6438
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2294
array_view< T, N > section(const index< N > &origin, const extent< N > &ext) __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4930
tiled_extent(const tiled_extent< 1 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2078
Definition: hc.hpp:6342
int __amdgcn_mbcnt_lo(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.lo")
Direct copy from indexed active work-item within a wavefront.
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6028
unsigned int get_version() const
Returns a 32-bit unsigned integer representing the version number of this accelerator.
Definition: hc.hpp:890
int __mul24(int x, int y)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication.
Definition: hc.hpp:3184
bool get_is_peer(const accelerator &other) const
Check if other is peer of this accelerator.
Definition: hc.hpp:1061
T & operator()(int i0, int i1) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:4852
STL namespace.
int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:3014
array_view< const T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:6272
void wait_with_tile_static_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3382
Definition: hc.hpp:2809
Definition: hc.hpp:7530
Definition: hc.hpp:4075
Definition: kalmar_exception.h:51
const index< 2 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3661
const index< 3 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3474
Definition: hc.hpp:7551
array_projection_helper< T, N >::result_type operator()(int i0) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4898
unsigned int atomic_fetch_inc(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest.
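For instance, a small sketch (assumed names; not from hc.hpp) of counting matching elements with atomic_fetch_inc inside a kernel:

#include <hc.hpp>
#include <vector>

unsigned int count_positive(std::vector<int>& host) {
    const int n = static_cast<int>(host.size());
    std::vector<unsigned int> hits(1, 0u);
    hc::array_view<int, 1> in(n, host);
    hc::array_view<unsigned int, 1> counter(1, hits);
    hc::parallel_for_each(
        hc::extent<1>(n),
        [=](hc::index<1> idx) [[hc]] {
            if (in[idx] > 0)
                hc::atomic_fetch_inc(&counter[0]);   // one atomic increment per match
        });
    counter.synchronize();                           // make the count visible on the host
    return hits[0];
}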
static bool set_default(const std::wstring &path)
Sets the default accelerator to the device path identified by the "path" argument.
Definition: hc.hpp:770
uint64_t __clock_u64() __HC__
Get system timestamp.
unsigned int __popcount_u32_b32(unsigned int input) __HC__
Count number of 1 bits in the input.
Definition: hc.hpp:2389
void wait_with_all_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3347
uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
array_view< const T, N > section(const index< N > &origin, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4938
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:6196
void * get_hsa_kernarg_region() const
Returns an opaque handle which points to the Kernarg region on the HSA agent.
Definition: hc.hpp:1012
void * get_group_segment_base_pointer() __HC__
Fetch the address of the beginning of group segment.
unsigned int get_cu_count() const
Return the compute unit count of the accelerator.
Definition: hc.hpp:1087
array_view< T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:5624
unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
void * get_hsa_am_region() const
Returns an opaque handle which points to the AM region on the HSA agent.
Definition: hc.hpp:977
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4520
extent & operator+=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1796
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5317
T & operator()(const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4812
namespace for internal classes of Kalmar compiler / runtime
Definition: hc.hpp:42
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3586
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4652
void wait_with_global_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3364
float __amdgcn_ds_swizzle(float src, int pattern)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2896
const index< 3 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3490
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view.
Definition: hc.hpp:5988
array(int e0, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4385
unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
std::wstring get_device_path() const
Returns a system-wide unique device instance path that matches the "Device Instance Path" property fo...
Definition: hc.hpp:877
unsigned int size() const __CPU__ __HC__
This member function returns the total linear size of this extent<N> (in units of elements)...
Definition: hc.hpp:1695
void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, const void *args, size_t argsize, hc::completion_future *cf=nullptr, const char *kernel_name=nullptr)
Dispatch a kernel into the accelerator_view.
Definition: hc.hpp:597
array_view(const extent< N > &ext, const value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5830
unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
array(int e0, int e1, int e2)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4278
unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2539
void * get_dynamic_group_segment_base_pointer() __HC__
Fetch the address of the beginning of dynamic group segment.
void discard_data() const
Indicates to the runtime that it may discard the current logical contents of this array_view...
Definition: hc.hpp:5495
queuing_mode get_queuing_mode() const
Returns the queuing mode that this accelerator_view was created with.
Definition: hc.hpp:152
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:6040
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2237
const index< 1 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3563
uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__
Do bit field selection.
double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
Definition: hc.hpp:4053
array_view< T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:5615
accelerator_view & operator=(const accelerator_view &other)
Assigns an accelerator_view object to "this" accelerator_view object and returns a reference to "this...
Definition: hc.hpp:141
array_view(const array_view &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5891
int value_type
The element type of extent<N>.
Definition: hc.hpp:1588
int tile_dim[2]
Tile size for each dimension.
Definition: hc.hpp:2135
array(int e0)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4274
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5997
uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, replace it with the value given in val and return the old ...
bool has_cpu_accessible_am()
Return true if the accelerator's memory can be mapped into the CPU's address space, and the CPU is allowed to access the memory directly with CPU memory operations.
Definition: hc.hpp:1106
Definition: hc.hpp:4087
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:4957
accelerator get_accelerator() const
Returns the accelerator that this accelerator_view has been created on.
Definition: hc.hpp:1461
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array.
Definition: hc.hpp:4663
extent & operator%=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1832
tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2152
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2009
Definition: kalmar_exception.h:22
uint64_t atomic_fetch_max(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
int __amdgcn_mbcnt_hi(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.hi")
Direct copy from indexed active work-item within a wavefront.
accelerator_view(const accelerator_view &other)
Copy-constructs an accelerator_view object.
Definition: hc.hpp:129
extent & operator-=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1800
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6230
array_view< const T, K > view_as(const extent< K > &viewExtent) const __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: hc.hpp:5087
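As a small illustration (hedged; the buffer and function names are made up), a rank-1 view of 12 elements can be re-viewed as a 3x4 rank-2 view without copying:

#include <hc.hpp>
#include <vector>

void reshape_example() {
    std::vector<float> host(12, 0.0f);
    hc::array_view<float, 1> flat(12, host);
    // view_as reinterprets the same 12 elements as 3 rows of 4; no data is moved.
    hc::array_view<float, 2> grid = flat.view_as(hc::extent<2>(3, 4));
    hc::parallel_for_each(grid.get_extent(),
        [=](hc::index<2> idx) [[hc]] {
            grid(idx[0], idx[1]) = static_cast<float>(idx[0] * 4 + idx[1]);
        });
    flat.synchronize();   // results are visible through the original rank-1 view
}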
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: hc.hpp:5289
void wait(hcWaitMode mode=hcWaitModeBlocked) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1235
array(const array &other)
Copy constructor.
Definition: hc.hpp:4242
index< N > operator-(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:498
unsigned int get_static_group_segment_size() __HC__
Fetch the size of static group segment.
array_view< ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:5668
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2207
uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
hcAgentProfile get_profile() const
Returns the profile of the accelerator.
Definition: hc.hpp:1029
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2050
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5957
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3441
completion_future(const completion_future &other)
Copy constructor.
Definition: hc.hpp:1147
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6220
accelerator_view get_default_view() const
Returns the default accelerator_view associated with the accelerator.
Definition: hc.hpp:813
extent(const int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1636
array_view(array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5160
accelerator(const accelerator &other)
Copy constructs an accelerator object.
Definition: hc.hpp:740
array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4570
unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4532
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3485
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4829
unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do bit field selection.
array(const extent< N > &ext, accelerator_view av, void *accelerator_pointer, access_type cpu_access_type=access_type_auto)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4405
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5948
bool contains(const index< N > &idx) const __CPU__ __HC__
Tests whether the index "idx" is properly contained within this extent (with an assumed origin of zer...
Definition: hc.hpp:1686
uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__
Return a bit mask that shows which active work-items in the wavefront have a non-zero input.
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3684
unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3542
void copy(const void *src, void *dst, size_t size_bytes)
Copies size_bytes bytes from src to dst.
Definition: hc.hpp:343
void copy(const array_view< const T, 1 > &src, const array_view< T, 1 > &dest)
The contents of "src" are copied into "dest".
Definition: hc.hpp:6694
extent operator-(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1791
extent operator--(int) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op 1.
Definition: hc.hpp:1860
const index< 1 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3591
array_view< T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5639
bool set_default_cpu_access_type(access_type type)
Sets the default_cpu_access_type for this accelerator.
Definition: hc.hpp:867
const T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view.
Definition: hc.hpp:5974
completion_future()
Default constructor.
Definition: hc.hpp:1138
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4838
std::wstring get_description() const
Returns a short textual description of the accelerator device.
Definition: hc.hpp:882
Definition: kalmar_exception.h:42
T value_type
The element type of this array.
Definition: hc.hpp:5145
T & operator[](const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4803
unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect.
unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5005
array_view(int e0, const value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5866
void * get_native_handle() const
Get the native handle for the asynchronous operation encapsulated in this completion_future object...
Definition: hc.hpp:1302
unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__
Takes four floating-point numbers, converts them to unsigned integer values, and packs them into a pack...
Definition: kalmar_math.h:691
array_view(const array_view< nc_T, N > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5881
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5848
array(const extent< N > &ext)
Constructs a new array with the supplied extent, located on the default view of the default accelerat...
Definition: hc.hpp:4264
int __mad24(int x, int y, int z)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication and then ad...
Definition: hc.hpp:3210
float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__
Unpacks a single element from a packed u8x4 value and converts it to an f32.
array(int e0, int e1, int e2, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4389
float __amdgcn_ds_permute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2880
unsigned int __popcount_u32_b64(unsigned long long int input) __HC__
Count number of 1 bits in the input.
Definition: hc.hpp:2399
array(int e0, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4317
array(const array_view< const T, N > &src, accelerator_view av, accelerator_view associated_av)
Constructs a staging array initialized with the array_view given by "src", which acts as a staging ar...
Definition: hc.hpp:4621
Represents an extent subdivided into tiles.
Definition: hc.hpp:2122
extent & operator+=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1816
void then(const functor &func)
This method enables specification of a completion callback func which is executed upon completion of ...
Definition: hc.hpp:1279
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2283
array(int e0, int e1, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4425
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4526
int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
Represents a logical (isolated) accelerator view of a compute accelerator.
Definition: hc.hpp:120
int tile_dim[1]
Tile size for each dimension.
Definition: hc.hpp:2044
Represents an N-dimensional region of memory (with type T) located on an accelerator.
Definition: hc.hpp:61
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4869
tiled_extent(const extent< 1 > &ext, int t0, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2096
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchronous operation.
Definition: hc.hpp:1344
access_type get_default_cpu_access_type() const
Get the default cpu access_type for buffers created on this accelerator.
Definition: hc.hpp:946
unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4646
uint64_t get_system_ticks()
Get the current tick count for the GPU platform.
Definition: hc.hpp:93
Definition: hc.hpp:6400
bool operator==(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:419
array_view< T, K > view_as(const extent< K > &viewExtent) __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: hc.hpp:5078
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view.
Definition: hc.hpp:5897
Represents an extent subdivided into tiles.
Definition: hc.hpp:59
array_view< T, N > section(const index< N > &idx) __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:4949
uint64_t get_end_tick()
Get the tick number when the underlying asynchronous operation ends.
Definition: hc.hpp:1330
array(int e0, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4320
void * get_hsa_am_region()
Returns an opaque handle which points to the AM region on the HSA agent.
Definition: hc.hpp:490
execute_order get_execute_order() const
Returns the execution order of this accelerator_view.
Definition: hc.hpp:157
extent(int e0) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by .
Definition: hc.hpp:1616
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4655
completion_future(completion_future &&other)
Move constructor.
Definition: hc.hpp:1159
bool is_hsa_accelerator() const
Returns whether the accelerator is based on HSA.
Definition: hc.hpp:1019
unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
bool get_supports_limited_double_precision() const
Returns a boolean value indicating whether the accelerator has limited double precision support (excl...
Definition: hc.hpp:922
const index< 1 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3575
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: hc.hpp:4301
uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
extent(_Tp...__t) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by .
Definition: hc.hpp:1620
array_view(int e0)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5258
int __unpacklo_s8x4(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2250
accelerator_view create_view(execute_order order=execute_in_order, queuing_mode mode=queuing_mode_automatic)
Creates and returns a new accelerator view on the accelerator with the supplied queuing mode...
Definition: hc.hpp:823
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4993
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5376
int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
array_projection_helper< T, N >::const_result_type operator()(int i0) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4906
const index< 1 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3569
int get_pending_async_ops()
Returns the number of pending asynchronous operations on this accelerator view.
Definition: hc.hpp:447
int get_use_count() const
Definition: hc.hpp:1381
bool get_has_display() const
This property indicates that the accelerator may be shared by (and thus have interference from) the o...
Definition: hc.hpp:900
unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
Definition: hc.hpp:2409
tiled_extent(int e0, int t0) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2059
index< N > operator+(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:492
extent(int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1647
std::future_status wait_for(const std::chrono::duration< _Rep, _Period > &_Rel_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1248
unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array_view< const T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6225
void wait(hcWaitMode waitMode=hcWaitModeBlocked)
Performs a blocking wait for completion of all commands submitted to the accelerator view prior to ca...
Definition: hc.hpp:208
unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
accelerator_view get_accelerator_view() const
This property returns the accelerator_view representing the location where this array has been alloca...
Definition: hc.hpp:4669
unsigned int __atomic_wrapinc(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations:
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5464
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4596
Represents a unique position in N-dimensional space.
Definition: kalmar_index.h:226
array(const extent< N > &ext, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator vi...
Definition: hc.hpp:4549
uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:4972
int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator.
Definition: hc.hpp:953
bool operator!=(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:431
array_view(int e0, value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5244
extent & operator/=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1767
const index< 2 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3673
void copy_to(array &dest) const
Copies the contents of this array to the array given by "dest", as if by calling "copy(*this, dest)".
Definition: hc.hpp:4735
extent & operator/=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1828
tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2164
array & operator=(const array_view< T, N > &src)
Assigns the contents of the array_view "src", as if by calling "copy(src, *this)".
Definition: hc.hpp:4722
int __any(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2781
tiled_extent(const extent< 2 > &ext, int t0, int t1) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2182
int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary byte boundary.
extent & operator%=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1771
unsigned int __lastbit_u32_s32(int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2543
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6132
unsigned int __firstbit_u32_s32(int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2502
bool is_ready()
Returns whether the asynchronous operation has completed.
Definition: hc.hpp:1357
access_type get_cpu_access_type() const
This property returns the CPU "access_type" allowed for this array.
Definition: hc.hpp:4680
T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a reference to the element of this array_view that is at the location in N-dimensional space ...
Definition: hc.hpp:5517
tiled_extent(const tiled_extent< 3 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2273
uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4332
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3640
uint64_t __cycle_u64() __HC__
Get hardware cycle count.
unsigned int get_group_segment_size() __HC__
Fetch the size of group segment.
float __amdgcn_wave_rr1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2963
uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect.
array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4427
uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
extent & operator--() __CPU__ __HC__
Produces the same effect as (*this) = (*this) - 1.
Definition: hc.hpp:1856
completion_future & operator=(completion_future &&_Other)
Move assignment.
Definition: hc.hpp:1188
array(int e0, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4568
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6088
unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
This function is mostly the same as sad except the sum of absolute differences is added to the most s...
tiled_extent(int e0, int t0, int size) __CPU__ __HC__
Constructs a tiled extent with the specified extent and tile sizes. ...
Definition: hc.hpp:2069
float __amdgcn_ds_bpermute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2865
bool set_cu_mask(const std::vector< bool > &cu_mask)
Sets a CU (compute unit) affinity mask for this command queue.
Definition: hc.hpp:618
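A minimal sketch, assuming hc::accelerator::create_view() as the way to obtain a dedicated queue; the mask width is illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      hc::accelerator acc;                        // default accelerator
      hc::accelerator_view av = acc.create_view();
      // Restrict this queue to the first four compute units (illustrative mask).
      std::vector<bool> mask(4, true);
      bool applied = av.set_cu_mask(mask);        // true if the mask was accepted
      (void)applied;
      return 0;
    }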
T * data() const __CPU__ __HC__
Returns a pointer to the raw data underlying this array.
Definition: hc.hpp:4760
T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view.
Definition: hc.hpp:5352
int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
uint64_t __ballot(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return an integer whose Nth bit is ...
Definition: hc.hpp:2800
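A minimal sketch of collecting a wavefront-wide vote with __ballot; the predicate and extent are illustrative.

    #include <hc.hpp>
    #include <vector>
    #include <cstdint>

    int main() {
      std::vector<uint64_t> host(64, 0);
      hc::array_view<uint64_t, 1> out(64, host.data());
      hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1> idx) [[hc]] {
        int pred = (idx[0] % 2 == 0);       // illustrative predicate
        out[idx] = hc::__ballot(pred);      // bit N set iff lane N's predicate was non-zero
      });
      out.synchronize();
      return 0;
    }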
int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4640
extent & operator++() __CPU__ __HC__
Produces the same effect as (*this) = (*this) + 1.
Definition: hc.hpp:1847
index< N > operator*(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:547
array_view< T, 2 > section(int i0, int i1, int e0, int e1) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5001
index< N > operator%(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:571
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view.
Definition: hc.hpp:5278
array(int e0, int e1, int e2, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4329
int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
unsigned int __activelanecount_u32_b1(unsigned int input) __HC__
Count the number of active work-items in the current wavefront that have a non-zero input...
Definition: hc.hpp:2768
array(const array_view< const T, N > &src)
Constructs a new array, located on the default view of the default accelerator, initialized with the ...
Definition: hc.hpp:4348
Definition: hc.hpp:3929
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4593
extent & operator-=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1759
uint64_t __bitrev_b64(uint64_t src0)[[hc]] __asm("llvm.bitreverse.i64")
Reverse the bits.
bool get_supports_cpu_shared_memory() const
Returns a boolean value indicating whether the accelerator supports memory accessible both by the acc...
Definition: hc.hpp:941
uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
float __amdgcn_wave_sr1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2923
int __all(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2790
accelerator(const std::wstring &path)
Constructs a new accelerator object that represents the physical device named by the "path" argument...
Definition: hc.hpp:730
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2302
bool get_is_emulated() const
Returns a boolean value indicating whether the accelerator is emulated.
Definition: hc.hpp:935
Definition: hc.hpp:6372
bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__
These functions attempt to perform these three steps atomically:
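A minimal sketch of a single compare-and-swap issued from within a kernel; the extent is illustrative, and only a work-item that observes the expected value installs the new one.

    #include <hc.hpp>
    #include <cstdint>

    int main() {
      uint64_t flag = 0;
      hc::array_view<uint64_t, 1> flag_view(1, &flag);
      hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1>) [[hc]] {
        uint64_t expected = 0;
        // Succeeds (returns true) only if *dest still equals *expected_val.
        hc::atomic_compare_exchange(&flag_view[0], &expected, 1);
      });
      flag_view.synchronize();
      return 0;
    }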
completion_future & operator=(const completion_future &_Other)
Copy assignment.
Definition: hc.hpp:1170
int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
extent & operator=(const extent &other) __CPU__ __HC__
Assigns the component values of "other" to this extent<N> object.
Definition: hc.hpp:1657
int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
const T & operator()(int i0) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6123
array_view< T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:5602
array(int e0, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4423
int __unpackhi_s16x2(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4529
Heterogeneous C++ (HC) namespace.
Definition: grid_launch.h:10
int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item based on bitwise XOR of caller work-item ID within a wavefront...
Definition: hc.hpp:3142
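A minimal sketch of the usual butterfly reduction built on __shfl_xor, assuming a full wavefront of active work-items; the sizes are illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> in_host(64, 1), out_host(64, 0);
      hc::array_view<int, 1> in(64, in_host.data());
      hc::array_view<int, 1> out(64, out_host.data());
      hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1> idx) [[hc]] {
        int v = in[idx];
        // After log2(width) XOR steps, every lane holds the wavefront-wide sum.
        for (int mask = __HSA_WAVEFRONT_SIZE__ / 2; mask > 0; mask >>= 1)
          v += hc::__shfl_xor(v, mask);
        out[idx] = v;
      });
      out.synchronize();
      return 0;
    }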
int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
void * get_hsa_am_system_region() const
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:989
extent operator++(int) __CPU__ __HC__
Produces the same effect as (*this) = (*this) + 1.
Definition: hc.hpp:1851
static std::vector< accelerator > get_all()
Returns a std::vector of accelerator objects (in no specific order) representing all accelerators tha...
Definition: hc.hpp:749
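A minimal sketch of enumerating the available accelerators, assuming the C++AMP-style get_device_path()/get_description() accessors.

    #include <hc.hpp>
    #include <iostream>

    int main() {
      // Enumerate every accelerator the runtime can see (order is unspecified).
      for (const hc::accelerator& acc : hc::accelerator::get_all())
        std::wcout << acc.get_device_path() << L" : "
                   << acc.get_description() << L"\n";
      return 0;
    }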
bool get_supports_double_precision() const
Returns a Boolean value indicating whether this accelerator supports double-precision (double) comput...
Definition: hc.hpp:914
unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
std::vector< accelerator_view > get_all_views()
Returns a vector of all accelerator_view objects associated with this accelerator.
Definition: hc.hpp:960
array_view< T, 1 > section(int i0, int e0) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4989
void copy_to(const array_view &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5335
array_view< T, N > section(const extent< N > &ext) __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:4968
T & operator()(int i0, int i1, int i2) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:4855
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4460
array(int e0, int e1, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4323
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator supports debugging.
Definition: hc.hpp:929
int __unpackhi_s8x4(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6128
uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5928
void tile_static_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for tile-static (but not global) memory operations...
extent & operator-=(int value) __CPU__ __HC__
Produces the same effect as (*this) = (*this) - value.
Definition: hc.hpp:1820
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator view.
Definition: hc.hpp:437
unsigned int __lastbit_u32_s64(unsigned long long input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2547
uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
#define __HSA_WAVEFRONT_SIZE__
Fetch the size of a wavefront.
Definition: hc.hpp:2373
uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
tile_barrier(const tile_barrier &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3317
int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
void * get_hsa_queue()
Returns an opaque handle which points to the underlying HSA queue.
Definition: hc.hpp:468
unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2104
array(const extent< N > &ext, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4392
bool valid() const
This method is functionally identical to std::shared_future<void>::valid.
Definition: hc.hpp:1213
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array.
Definition: hc.hpp:4775
array_view(const array_view &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5272
array_view< const ElementType, 1 > reinterpret_as() const __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with an (unsafe) reinterpretation of the element type.
Definition: hc.hpp:5051
unsigned int atomic_fetch_dec(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest.
array(array &&other)
Move constructor.
Definition: hc.hpp:4253
uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
T value_type
The element type of this array.
Definition: hc.hpp:4227
array_view< T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:5693
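A minimal sketch of re-shaping a rank-1 array_view as a rank-2 view over the same storage; the sizes are illustrative, and the new extent here covers exactly the same 12 elements.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> host(12, 0);
      hc::array_view<int, 1> flat(12, host.data());
      // 12 elements re-shaped as a 3x4 view; no data is copied.
      hc::array_view<int, 2> grid = flat.view_as(hc::extent<2>(3, 4));
      (void)grid;
      return 0;
    }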
unsigned int __activelaneid_u32() __HC__
Get the count of the number of earlier (in flattened work-item order) active work-items within the sa...
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5009
unsigned int __firstbit_u32_s64(long long int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2520
array_view< ElementType, 1 > reinterpret_as() __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with an (unsafe) reinterpretation of the element type.
Definition: hc.hpp:5038
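A minimal sketch of viewing an array's storage as a flat run of a different element type; the types and extent are illustrative.

    #include <hc.hpp>

    int main() {
      hc::array<float, 2> arr(hc::extent<2>(4, 4));   // 16 floats on the default accelerator
      // Same storage seen as a rank-1 view of 32-bit words; no data is copied.
      hc::array_view<unsigned int, 1> bits = arr.reinterpret_as<unsigned int>();
      (void)bits;
      return 0;
    }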
const index< 2 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3667
int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl)[[hc]]
Move DPP intrinsic.
unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary bit boundary.
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4872
int __unpack_s32_s32x2(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5412
const index< 3 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3462
const index< 3 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3480
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2111
array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4517
void all_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for both global and tile-static memory operations...
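A minimal sketch of the usual fence-plus-barrier pattern in a tiled kernel; here the fence is taken through tile_barrier::wait_with_all_memory_fence(), which combines the rendezvous with the fence these free functions establish. Sizes are illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> host(256, 0);
      hc::array_view<int, 1> out(256, host.data());
      hc::parallel_for_each(hc::extent<1>(256).tile(64), [=](hc::tiled_index<1> tidx) [[hc]] {
        tile_static int lds[64];
        lds[tidx.local[0]] = tidx.global[0];
        // Make the tile-static stores visible to the whole tile before reading a neighbour's slot.
        tidx.barrier.wait_with_all_memory_fence();
        out[tidx.global] = lds[(tidx.local[0] + 1) % 64];
      });
      out.synchronize();
      return 0;
    }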
bool is_hsa_accelerator()
Returns whether the accelerator_view is based on HSA.
Definition: hc.hpp:533
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5644
unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
bool operator==(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent the sam...
Definition: hc.hpp:837
std::future_status wait_until(const std::chrono::time_point< _Clock, _Duration > &_Abs_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1253
Definition: kalmar_math.h:297
unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2489
Represents a physical accelerated computing device.
Definition: hc.hpp:700
Definition: hc_am.hpp:21
Definition: kalmar_runtime.h:14
int get_seqnum() const
Return the unique integer sequence-number for the accelerator.
Definition: hc.hpp:1095
unsigned int __atomic_wrapdec(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations:
int __lane_id(void)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2845
array_view(const array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5796
float atomic_fetch_sub(float *x, float y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3557
array_view(const extent< N > &extent, const Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5816
float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with higher ID relative to caller within a wavefront.
Definition: hc.hpp:3111
extent operator+(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1786
static accelerator_view get_auto_selection_view()
Returns an accelerator_view which when passed as the first argument to a parallel_for_each call cause...
Definition: hc.hpp:789
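A minimal sketch of dispatching against the auto-selection view so the runtime picks the accelerator; the extent and kernel body are illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> host(1024, 0);
      hc::array_view<int, 1> out(1024, host.data());
      hc::accelerator_view av = hc::accelerator::get_auto_selection_view();
      hc::parallel_for_each(av, hc::extent<1>(1024), [=](hc::index<1> idx) [[hc]] {
        out[idx] = idx[0];
      });
      out.synchronize();
      return 0;
    }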