#include "hc_defines.h"
#include "kalmar_exception.h"
#include "kalmar_index.h"
#include "kalmar_runtime.h"
#include "kalmar_serialize.h"
#include "kalmar_launch.h"
#include "kalmar_buffer.h"
#include "kalmar_math.h"

#include "hsa_atomic.h"
#include "kalmar_cpu_launch.h"
#include "hcc_features.hpp"

#ifndef __HC__
# define __HC__ [[hc]]
#endif

#ifndef __CPU__
# define __CPU__ [[cpu]]
#endif

typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
// forward declarations
class accelerator;
class accelerator_view;
class completion_future;
template <int N> class extent;
template <int N> class tiled_extent;
template <typename T, int N> class array_view;
template <typename T, int N> class array;
/**
 * Get the current tick count of the underlying platform.
 */
inline uint64_t get_system_ticks() {
    return Kalmar::getContext()->getSystemTicks();
}

/**
 * Get the frequency (ticks per second) of the underlying platform timer.
 */
inline uint64_t get_tick_frequency() {
    return Kalmar::getContext()->getSystemTickFrequency();
}
#define GET_SYMBOL_ADDRESS(acc, symbol) \
    acc.get_symbol_address( #symbol );

/**
 * Represents a logical (isolated) execution queue on a compute accelerator.
 */
class accelerator_view {
public:
    accelerator_view(const accelerator_view& other) :
        pQueue(other.pQueue) {}

    accelerator_view& operator=(const accelerator_view& other) {
        pQueue = other.pQueue;
        return *this;
    }

    unsigned int get_version() const;

    /**
     * Performs a blocking wait for completion of all commands in this view.
     */
    void wait(hcWaitMode waitMode = hcWaitModeBlocked) {
        pQueue->wait(waitMode);
        Kalmar::getContext()->flushPrintfBuffer();
    }
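    /**
     * Usage sketch (illustrative, not part of the original header): obtain the
     * default view of the default accelerator and drain it. Assumes an HCC
     * toolchain where this header is consumed as the hc API.
     * @code
     * hc::accelerator acc;                              // default device
     * hc::accelerator_view av = acc.get_default_view();
     * // ... enqueue kernels and copies on av ...
     * av.wait();                                        // blocked wait
     * @endcode
     */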
    /**
     * Inserts a marker into the queue; the returned completion_future becomes
     * ready once previously enqueued commands (and any listed dependencies)
     * have completed.
     */
    completion_future create_marker(memory_scope fence_scope = system_scope) const;

    completion_future create_blocking_marker(completion_future& dependent_future,
                                             memory_scope fence_scope = system_scope) const;

    completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list,
                                             memory_scope fence_scope = system_scope) const;

    template <typename InputIterator>
    completion_future create_blocking_marker(InputIterator first, InputIterator last,
                                             memory_scope scope) const;

    /** Copies size_bytes bytes from src to dst, synchronously. */
    void copy(const void* src, void* dst, size_t size_bytes) {
        pQueue->copy(src, dst, size_bytes);
    }

    void copy_ext(const void* src, void* dst, size_t size_bytes, hcCommandKind copyDir,
                  const hc::AmPointerInfo& srcInfo, const hc::AmPointerInfo& dstInfo,
                  const hc::accelerator* copyAcc, bool forceUnpinnedCopy);

    void copy_ext(const void* src, void* dst, size_t size_bytes, hcCommandKind copyDir,
                  const hc::AmPointerInfo& srcInfo, const hc::AmPointerInfo& dstInfo,
                  bool forceUnpinnedCopy);

    completion_future copy_async(const void* src, void* dst, size_t size_bytes);

    completion_future copy_async_ext(const void* src, void* dst, size_t size_bytes,
                                     hcCommandKind copyDir,
                                     const hc::AmPointerInfo& srcInfo,
                                     const hc::AmPointerInfo& dstInfo,
                                     const hc::accelerator* copyAcc);
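    /**
     * Usage sketch (illustrative): a synchronous raw-byte copy through the
     * view. The host buffers here are placeholders.
     * @code
     * std::vector<float> host(1024, 1.0f), mirror(1024);
     * av.copy(host.data(), mirror.data(), 1024 * sizeof(float));
     * @endcode
     */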
    /** Compares "this" accelerator_view with the passed one. */
    bool operator==(const accelerator_view& other) const {
        return pQueue == other.pQueue;
    }

    bool operator!=(const accelerator_view& other) const {
        return pQueue != other.pQueue;
    }

    /** Returns the maximum size of tile static area available on this view. */
    int get_max_tile_static_size() {
        return pQueue.get()->getDev()->GetMaxTileStaticSize();
    }

    /** Returns the number of pending asynchronous operations on this view. */
    int get_pending_async_ops() {
        return pQueue->getPendingAsyncOps();
    }

    /** Returns true if the command queue of this view is currently empty. */
    bool get_is_empty() {
        return pQueue->isEmpty();
    }

    /** Returns an opaque handle to the underlying HSA queue. */
    void* get_hsa_queue() {
        return pQueue->getHSAQueue();
    }

    /** Returns an opaque handle to the underlying HSA agent. */
    void* get_hsa_agent() {
        return pQueue->getHSAAgent();
    }

    /** Returns an opaque handle to the AM region on the HSA agent. */
    void* get_hsa_am_region() {
        return pQueue->getHSAAMRegion();
    }

    /** Returns an opaque handle to the AM system memory region. */
    void* get_hsa_am_system_region() {
        return pQueue->getHSAAMHostRegion();
    }

    /** Returns an opaque handle to the fine-grained AM system memory region. */
    void* get_hsa_am_finegrained_system_region() {
        return pQueue->getHSACoherentAMHostRegion();
    }

    /** Returns an opaque handle to the kernarg region on the HSA agent. */
    void* get_hsa_kernarg_region() {
        return pQueue->getHSAKernargRegion();
    }

    /** Returns true if this view is based on the HSA runtime. */
    bool is_hsa_accelerator() {
        return pQueue->hasHSAInterOp();
    }

    /** Dispatch a kernel given an already-populated AQL packet. */
    void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t* aql,
                             const void* args, size_t argsize,
                             hc::completion_future* cf = nullptr,
                             const char* kernel_name = nullptr) {
        pQueue->dispatch_hsa_kernel(aql, args, argsize, cf, kernel_name);
    }

    /**
     * Set a compute-unit affinity mask for this view. Returns true on
     * success, false otherwise (e.g. on non-HSA accelerators).
     */
    bool set_cu_mask(const std::vector<bool>& cu_mask) {
        if (is_hsa_accelerator()) {
            return pQueue->set_cu_mask(cu_mask);
        }
        return false;
    }

private:
    accelerator_view(std::shared_ptr<Kalmar::KalmarQueue> pQueue) : pQueue(pQueue) {}
    std::shared_ptr<Kalmar::KalmarQueue> pQueue;
    friend class accelerator;
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;

    template <typename Kernel> friend
        void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&,
                                            const Kernel&);
    template <typename Kernel, int dim_ext> friend
        void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t*, size_t*,
            const Kernel&, void*, size_t);
    template <typename Kernel, int dim_ext> friend
        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t*, size_t*,
            const Kernel&, void*, size_t);
    template <typename Kernel, int dim_ext> friend
        void Kalmar::mcw_cxxamp_launch_kernel(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t*, size_t*, const Kernel&);
    template <typename Kernel, int dim_ext> friend
        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_launch_kernel_async(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t*, size_t*, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename Kernel, int N> friend
        completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>&,
                                                Kernel const&, extent<N> const&);
#endif

    // parallel_for_each needs access to the underlying queue
    template <int N, typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);

#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
public:
#endif
    __attribute__((annotate("user_deserialize")))
    accelerator_view() __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        throw runtime_exception("errorMsg_throw", 0);
#endif
    }
};

/**
 * Represents a physical accelerated computing device.
 */
class accelerator {
public:
    accelerator() : accelerator(L"default") {}

    explicit accelerator(const std::wstring& path)
        : pDev(Kalmar::getContext()->getDevice(path)) {}
    /** Returns a std::vector of accelerator objects, one per physical device. */
    static std::vector<accelerator> get_all() {
        auto Devices = Kalmar::getContext()->getDevices();
        std::vector<accelerator> ret;
        for (auto&& i : Devices)
            ret.push_back(i);
        return ret;
    }

    /** Sets the default accelerator to the device identified by path. */
    static bool set_default(const std::wstring& path) {
        return Kalmar::getContext()->set_default(path);
    }

    /** Returns the auto-selection accelerator_view. */
    static accelerator_view get_auto_selection_view() {
        return Kalmar::getContext()->auto_select();
    }

    /** Creates a new accelerator_view (queue) on this accelerator. */
    accelerator_view create_view(execute_order order = execute_in_order,
                                 queuing_mode mode = queuing_mode_automatic) {
        auto pQueue = pDev->createQueue(order);
        pQueue->set_mode(mode);
        return pQueue;
    }

    /** Sets the default CPU access type for arrays created on this accelerator. */
    bool set_default_cpu_access_type(access_type type) {
        pDev->set_access(type);
        return true;
    }
    /** Returns the maximum size of tile static area available on this accelerator. */
    int get_max_tile_static_size() {
        return get_default_view().get_max_tile_static_size();
    }

    /** Returns a vector of all accelerator_views created on this accelerator. */
    std::vector<accelerator_view> get_all_views() {
        std::vector<accelerator_view> result;
        std::vector<std::shared_ptr<Kalmar::KalmarQueue>> queues = pDev->get_all_queues();
        for (auto q : queues) {
            result.push_back(q);
        }
        return result;
    }

    /** Returns an opaque handle to the AM region on the HSA agent. */
    void* get_hsa_am_region() const {
        return get_default_view().get_hsa_am_region();
    }

    /** Returns an opaque handle to the AM system memory region. */
    void* get_hsa_am_system_region() const {
        return get_default_view().get_hsa_am_system_region();
    }

    /** Returns an opaque handle to the fine-grained AM system memory region. */
    void* get_hsa_am_finegrained_system_region() const {
        return get_default_view().get_hsa_am_finegrained_system_region();
    }

    /** Returns an opaque handle to the kernarg region on the HSA agent. */
    void* get_hsa_kernarg_region() const {
        return get_default_view().get_hsa_kernarg_region();
    }

    /** Returns true if this accelerator is based on the HSA runtime. */
    bool is_hsa_accelerator() const {
        return get_default_view().is_hsa_accelerator();
    }

    /** Returns the profile of the underlying agent. */
    hcAgentProfile get_profile() const {
        return pDev->getProfile();
    }
    void memcpy_symbol(const char* symbolName, void* hostptr, size_t count,
                       size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
        pDev->memcpySymbol(symbolName, hostptr, count, offset, kind);
    }

    void memcpy_symbol(void* symbolAddr, void* hostptr, size_t count,
                       size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
        pDev->memcpySymbol(symbolAddr, hostptr, count, offset, kind);
    }

    void* get_symbol_address(const char* symbolName) {
        return pDev->getSymbolAddress(symbolName);
    }
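    /**
     * Usage sketch (illustrative): look up a device-side global by name and
     * update it from the host. `my_global` is a hypothetical device symbol;
     * the GET_SYMBOL_ADDRESS macro supplies its own trailing semicolon.
     * @code
     * hc::accelerator acc;
     * void* addr = GET_SYMBOL_ADDRESS(acc, my_global)
     * int value = 42;
     * acc.memcpy_symbol("my_global", &value, sizeof(value)); // host -> device
     * @endcode
     */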
    /** Returns an opaque handle to the underlying HSA agent. */
    void* get_hsa_agent() const {
        return pDev->getHSAAgent();
    }

    /** Checks if other is peer of this accelerator. */
    bool get_is_peer(const accelerator& other) const {
        return pDev->is_peer(other.pDev);
    }

    /** Returns a vector of all accelerators that are peers of this one. */
    std::vector<accelerator> get_peers() const {
        std::vector<accelerator> peers;

        const auto& accs = get_all();

        for (auto iter = accs.begin(); iter != accs.end(); iter++) {
            if (this->get_is_peer(*iter))
                peers.push_back(*iter);
        }
        return peers;
    }

    /** Returns the number of compute units on this accelerator. */
    unsigned int get_cu_count() const {
        return pDev->get_compute_unit_count();
    }

    /** Returns the unique sequence number of this accelerator. */
    int get_seqnum() const {
        return pDev->get_seqnum();
    }

    /** Returns true if this accelerator's memory is CPU-accessible. */
    bool has_cpu_accessible_am() const {
        return pDev->has_cpu_accessible_am();
    }

    Kalmar::KalmarDevice* get_dev_ptr() const { return pDev; }

private:
    accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {}
    friend class accelerator_view;
    Kalmar::KalmarDevice* pDev;
};
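/**
 * Usage sketch (illustrative): enumerate the accelerators in the system and
 * query peer relationships. get_description() is assumed from the C++AMP-style
 * accelerator interface and is not shown in this excerpt.
 * @code
 * std::vector<hc::accelerator> accs = hc::accelerator::get_all();
 * for (const auto& a : accs) {
 *     std::vector<hc::accelerator> peers = a.get_peers();
 *     std::wcout << a.get_description()
 *                << L" peers: " << peers.size() << std::endl;
 * }
 * @endcode
 */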
/**
 * This class is the return type of all asynchronous APIs. Its interface is
 * analogous to std::shared_future<void>; in addition it allows registering a
 * callback via then() and exposes timestamps of the underlying async op.
 */
class completion_future {
public:
    completion_future() : __amp_future(), __thread_then(nullptr), __asyncOp(nullptr) {}

    completion_future(const completion_future& other)
        : __amp_future(other.__amp_future), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}

    completion_future(completion_future&& other)
        : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}

    completion_future& operator=(const completion_future& _Other) {
        if (this != &_Other) {
            __amp_future = _Other.__amp_future;
            __thread_then = _Other.__thread_then;
            __asyncOp = _Other.__asyncOp;
        }
        return *this;
    }

    completion_future& operator=(completion_future&& _Other) {
        if (this != &_Other) {
            __amp_future = std::move(_Other.__amp_future);
            __thread_then = _Other.__thread_then;
            __asyncOp = _Other.__asyncOp;
        }
        return *this;
    }

    bool valid() const {
        return __amp_future.valid();
    }

    void wait(hcWaitMode mode = hcWaitModeBlocked) const {
        if (this->valid()) {
            if (__asyncOp != nullptr) {
                __asyncOp->setWaitMode(mode);
            }
            __amp_future.wait();
        }
        Kalmar::getContext()->flushPrintfBuffer();
    }

    template <class _Rep, class _Period>
    std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const {
        return __amp_future.wait_for(_Rel_time);
    }

    template <class _Clock, class _Duration>
    std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const {
        return __amp_future.wait_until(_Abs_time);
    }

    operator std::shared_future<void>() const {
        return __amp_future;
    }

    /** Register a callback to run once the asynchronous operation completes. */
    template <typename functor>
    void then(const functor& func) {
#if __KALMAR_ACCELERATOR__ != 1
        // the callback can only be assigned once
        if (__thread_then == nullptr) {
            // spawn a thread that waits on the future, then runs the functor
            __thread_then = new std::thread([&]() __CPU__ {
                this->wait();
                if (this->valid())
                    func();
            });
        }
#endif
    }

    void* get_native_handle() const {
        if (__asyncOp != nullptr) {
            return __asyncOp->getNativeHandle();
        }
        return nullptr;
    }

    uint64_t get_begin_tick() {
        if (__asyncOp != nullptr) {
            return __asyncOp->getBeginTimestamp();
        }
        return 0L;
    }

    uint64_t get_end_tick() {
        if (__asyncOp != nullptr) {
            return __asyncOp->getEndTimestamp();
        }
        return 0L;
    }

    uint64_t get_tick_frequency() {
        if (__asyncOp != nullptr) {
            return __asyncOp->getTimestampFrequency();
        }
        return 0L;
    }

    bool is_ready() {
        if (__asyncOp != nullptr) {
            return __asyncOp->isReady();
        }
        return false;
    }

    ~completion_future() {
        if (__thread_then != nullptr) {
            __thread_then->join();
        }
        delete __thread_then;
        __thread_then = nullptr;

        if (__asyncOp != nullptr) {
            __asyncOp = nullptr;
        }
    }

private:
    std::shared_future<void> __amp_future;
    std::thread* __thread_then = nullptr;
    std::shared_ptr<Kalmar::KalmarAsyncOp> __asyncOp;

    completion_future(std::shared_ptr<Kalmar::KalmarAsyncOp> event)
        : __amp_future(*(event->getFuture())), __asyncOp(event) {}

    completion_future(const std::shared_future<void>& __future)
        : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {}

    friend class Kalmar::HSAQueue;
    friend class accelerator_view;
    // parallel_for_each needs to construct completion_futures
    template <int N, typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);

    // ... and so do the asynchronous copy routines
    template <typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);

    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);

    template <typename OutputIter, typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
    template <typename OutputIter, typename T, int N> friend
        completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);

    template <typename T, int N> friend class array_view;
};
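/**
 * Usage sketch (illustrative): poll and time an asynchronous operation through
 * its completion_future. src/dst/size_bytes are placeholders.
 * @code
 * hc::completion_future cf = av.copy_async(src, dst, size_bytes);
 * if (!cf.is_ready())
 *     cf.wait();                       // hcWaitModeBlocked by default
 * uint64_t ticks = cf.get_end_tick() - cf.get_begin_tick();
 * double seconds = double(ticks) / double(cf.get_tick_frequency());
 * @endcode
 */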
inline accelerator accelerator_view::get_accelerator() const {
    return pQueue->getDev();
}

inline completion_future accelerator_view::create_marker(memory_scope scope) const {
    std::shared_ptr<Kalmar::KalmarAsyncOp> deps[1];

    // if necessary, create an explicit dependency on the previous command
    std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);

    int cnt = 0;
    if (depOp) {
        deps[cnt++] = depOp;
    }

    return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
}

inline unsigned int accelerator_view::get_version() const {
    return get_accelerator().get_version();
}

inline completion_future accelerator_view::create_blocking_marker(completion_future& dependent_future,
                                                                  memory_scope scope) const {
    std::shared_ptr<Kalmar::KalmarAsyncOp> deps[2];

    // if necessary, create an explicit dependency on the previous command
    std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);

    int cnt = 0;
    if (depOp) {
        deps[cnt++] = depOp;
    }

    if (dependent_future.__asyncOp) {
        deps[cnt++] = dependent_future.__asyncOp;
    }

    return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
}
template <typename InputIterator>
inline completion_future accelerator_view::create_blocking_marker(InputIterator first, InputIterator last,
                                                                  memory_scope scope) const {
    // at most 5 dependent signals fit into one barrier packet
    std::shared_ptr<Kalmar::KalmarAsyncOp> deps[5];
    completion_future lastMarker;

    // if necessary, create an explicit dependency on the previous command
    std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);

    int cnt = 0;
    if (depOp) {
        deps[cnt++] = depOp;
    }

    // loop over the signals, grouping every 5 into one barrier packet
    for (auto iter = first; iter != last; ++iter) {
        if (iter->__asyncOp) {
            deps[cnt++] = iter->__asyncOp;
            if (cnt == 5) {
                lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, hc::no_scope));
                cnt = 0;
            }
        }
    }

    if (cnt) {
        lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
    }

    return lastMarker;
}

inline completion_future accelerator_view::create_blocking_marker(std::initializer_list<completion_future> dependent_future_list,
                                                                  memory_scope scope) const {
    return create_blocking_marker(dependent_future_list.begin(), dependent_future_list.end(), scope);
}
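/**
 * Usage sketch (illustrative): fence one queue on work submitted elsewhere by
 * chaining completion_futures through a blocking marker.
 * @code
 * hc::completion_future f1 = av1.copy_async(a, b, bytes);
 * hc::completion_future f2 = av2.copy_async(c, d, bytes);
 * hc::completion_future barrier =
 *     av1.create_blocking_marker({f1, f2}, hc::system_scope);
 * barrier.wait();   // both copies are complete and visible system-wide
 * @endcode
 */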
inline void accelerator_view::copy_ext(const void* src, void* dst, size_t size_bytes,
                                       hcCommandKind copyDir, const hc::AmPointerInfo& srcInfo,
                                       const hc::AmPointerInfo& dstInfo, const hc::accelerator* copyAcc,
                                       bool forceUnpinnedCopy) {
    pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo,
                     copyAcc ? copyAcc->pDev : nullptr, forceUnpinnedCopy);
}

inline void accelerator_view::copy_ext(const void* src, void* dst, size_t size_bytes,
                                       hcCommandKind copyDir, const hc::AmPointerInfo& srcInfo,
                                       const hc::AmPointerInfo& dstInfo, bool forceHostCopyEngine) {
    pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo, forceHostCopyEngine);
}

inline completion_future accelerator_view::copy_async(const void* src, void* dst, size_t size_bytes) {
    return completion_future(pQueue->EnqueueAsyncCopy(src, dst, size_bytes));
}

inline completion_future accelerator_view::copy_async_ext(const void* src, void* dst, size_t size_bytes,
                                                           hcCommandKind copyDir,
                                                           const hc::AmPointerInfo& srcInfo,
                                                           const hc::AmPointerInfo& dstInfo,
                                                           const hc::accelerator* copyAcc) {
    return completion_future(pQueue->EnqueueAsyncCopyExt(src, dst, size_bytes, copyDir, srcInfo, dstInfo,
                                                         copyAcc ? copyAcc->pDev : nullptr));
}
/**
 * Represents a unique position in N-dimensional space.
 */
template <int N>
class extent {
public:
    static const int rank = N;
    typedef int value_type;

    extent() __CPU__ __HC__ : base_() {
        static_assert(N > 0, "Dimensionality must be positive");
    }

    extent(const extent& other) __CPU__ __HC__
        : base_(other.base_) {}

    template <typename ..._Tp>
    explicit extent(_Tp... __t) __CPU__ __HC__ : base_(__t...) {
        static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor");
        static_assert(sizeof...(__t) == N, "rank should be consistent");
    }

    explicit extent(const int components[]) __CPU__ __HC__
        : base_(components) {}

    explicit extent(int components[]) __CPU__ __HC__
        : base_(components) {}

    extent& operator=(const extent& other) __CPU__ __HC__ {
        base_.operator=(other.base_);
        return *this;
    }

    int operator[](unsigned int c) const __CPU__ __HC__ {
        return base_[c];
    }
    int& operator[](unsigned int c) __CPU__ __HC__ {
        return base_[c];
    }

    bool contains(const index<N>& idx) const __CPU__ __HC__ {
        return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
    }

    unsigned int size() const __CPU__ __HC__ {
        return Kalmar::index_helper<N, extent<N>>::count_size(*this);
    }

    tiled_extent<1> tile(int t0) const;
    tiled_extent<2> tile(int t0, int t1) const;
    tiled_extent<3> tile(int t0, int t1, int t2) const;

    tiled_extent<1> tile_with_dynamic(int t0, int dynamic_size) const;
    tiled_extent<2> tile_with_dynamic(int t0, int t1, int dynamic_size) const;
    tiled_extent<3> tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const;

    bool operator==(const extent& other) const __CPU__ __HC__ {
        return Kalmar::index_helper<N, extent<N>>::equal(*this, other);
    }
    bool operator!=(const extent& other) const __CPU__ __HC__ {
        return !(*this == other);
    }

    extent& operator+=(const extent& __r) __CPU__ __HC__ {
        base_.operator+=(__r.base_);
        return *this;
    }
    extent& operator-=(const extent& __r) __CPU__ __HC__ {
        base_.operator-=(__r.base_);
        return *this;
    }
    extent& operator*=(const extent& __r) __CPU__ __HC__ {
        base_.operator*=(__r.base_);
        return *this;
    }
    extent& operator/=(const extent& __r) __CPU__ __HC__ {
        base_.operator/=(__r.base_);
        return *this;
    }
    extent& operator%=(const extent& __r) __CPU__ __HC__ {
        base_.operator%=(__r.base_);
        return *this;
    }

    extent& operator+=(const index<N>& idx) __CPU__ __HC__ {
        base_.operator+=(idx.base_);
        return *this;
    }
    extent& operator-=(const index<N>& idx) __CPU__ __HC__ {
        base_.operator-=(idx.base_);
        return *this;
    }

    extent& operator+=(int value) __CPU__ __HC__ {
        base_.operator+=(value);
        return *this;
    }
    extent& operator-=(int value) __CPU__ __HC__ {
        base_.operator-=(value);
        return *this;
    }
    extent& operator*=(int value) __CPU__ __HC__ {
        base_.operator*=(value);
        return *this;
    }
    extent& operator/=(int value) __CPU__ __HC__ {
        base_.operator/=(value);
        return *this;
    }
    extent& operator%=(int value) __CPU__ __HC__ {
        base_.operator%=(value);
        return *this;
    }

    extent& operator++() __CPU__ __HC__ {
        base_.operator+=(1);
        return *this;
    }
    extent operator++(int) __CPU__ __HC__ {
        extent ret = *this;
        base_.operator+=(1);
        return ret;
    }
    extent& operator--() __CPU__ __HC__ {
        base_.operator-=(1);
        return *this;
    }
    extent operator--(int) __CPU__ __HC__ {
        extent ret = *this;
        base_.operator-=(1);
        return ret;
    }

private:
    typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
    base base_;
    template <int K, typename Q> friend struct Kalmar::index_helper;
    template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
};
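/**
 * Usage sketch (illustrative): extent arithmetic and containment checks.
 * @code
 * hc::extent<2> e(4, 8);                      // 4 rows, 8 columns
 * unsigned int n = e.size();                  // 32 elements
 * e *= 2;                                     // now 8 x 16
 * bool in = e.contains(hc::index<2>(7, 15));  // true
 * @endcode
 */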
/**
 * Represents an extent subdivided into tiles.
 */
template <int N>
class tiled_extent : public extent<N> {
public:
    static const int rank = N;
    int tile_dim[N];

    tiled_extent() __CPU__ __HC__ : extent<N>(), tile_dim{0} {}

    tiled_extent(const tiled_extent& other) __CPU__ __HC__ : extent<N>(other) {
        for (int i = 0; i < N; ++i) {
            tile_dim[i] = other.tile_dim[i];
        }
    }
};

/** Specialization of tiled_extent for the 1D case. */
template <>
class tiled_extent<1> : public extent<1> {
private:
    /** Size of the dynamic group segment, in bytes. */
    unsigned int dynamic_group_segment_size;

public:
    static const int rank = 1;
    int tile_dim[1];

    tiled_extent() __CPU__ __HC__ : extent(0), dynamic_group_segment_size(0), tile_dim{0} {}

    tiled_extent(int e0, int t0) __CPU__ __HC__ :
        extent(e0), dynamic_group_segment_size(0), tile_dim{t0} {}

    tiled_extent(int e0, int t0, int size) __CPU__ __HC__ :
        extent(e0), dynamic_group_segment_size(size), tile_dim{t0} {}

    tiled_extent(const tiled_extent<1>& other) __CPU__ __HC__ :
        extent(other[0]), dynamic_group_segment_size(other.dynamic_group_segment_size),
        tile_dim{other.tile_dim[0]} {}

    tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ :
        extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {}

    tiled_extent(const extent<1>& ext, int t0, int size) __CPU__ __HC__ :
        extent(ext), dynamic_group_segment_size(size), tile_dim{t0} {}

    void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
        dynamic_group_segment_size = size;
    }

    unsigned int get_dynamic_group_segment_size() const __CPU__ {
        return dynamic_group_segment_size;
    }
};
/** Specialization of tiled_extent for the 2D case. */
template <>
class tiled_extent<2> : public extent<2> {
private:
    /** Size of the dynamic group segment, in bytes. */
    unsigned int dynamic_group_segment_size;

public:
    static const int rank = 2;
    int tile_dim[2];

    tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__ :
        extent(e0, e1), dynamic_group_segment_size(0), tile_dim{t0, t1} {}

    tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__ :
        extent(e0, e1), dynamic_group_segment_size(size), tile_dim{t0, t1} {}

    tiled_extent(const tiled_extent<2>& other) __CPU__ __HC__ :
        extent(other[0], other[1]), dynamic_group_segment_size(other.dynamic_group_segment_size),
        tile_dim{other.tile_dim[0], other.tile_dim[1]} {}

    tiled_extent(const extent<2>& ext, int t0, int t1) __CPU__ __HC__ :
        extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1} {}

    tiled_extent(const extent<2>& ext, int t0, int t1, int size) __CPU__ __HC__ :
        extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1} {}

    void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
        dynamic_group_segment_size = size;
    }

    unsigned int get_dynamic_group_segment_size() const __CPU__ {
        return dynamic_group_segment_size;
    }
};
/** Specialization of tiled_extent for the 3D case. */
template <>
class tiled_extent<3> : public extent<3> {
private:
    /** Size of the dynamic group segment, in bytes. */
    unsigned int dynamic_group_segment_size;

public:
    static const int rank = 3;
    int tile_dim[3];

    tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__ :
        extent(e0, e1, e2), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}

    tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__ :
        extent(e0, e1, e2), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}

    tiled_extent(const tiled_extent<3>& other) __CPU__ __HC__ :
        extent(other[0], other[1], other[2]), dynamic_group_segment_size(other.dynamic_group_segment_size),
        tile_dim{other.tile_dim[0], other.tile_dim[1], other.tile_dim[2]} {}

    tiled_extent(const extent<3>& ext, int t0, int t1, int t2) __CPU__ __HC__ :
        extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}

    tiled_extent(const extent<3>& ext, int t0, int t1, int t2, int size) __CPU__ __HC__ :
        extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}

    void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
        dynamic_group_segment_size = size;
    }

    unsigned int get_dynamic_group_segment_size() const __CPU__ {
        return dynamic_group_segment_size;
    }
};
// implementation of extent<N>::tile()
template <int N>
inline tiled_extent<1> extent<N>::tile(int t0) const {
    static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
    return tiled_extent<1>(*this, t0);
}

template <int N>
inline tiled_extent<2> extent<N>::tile(int t0, int t1) const {
    static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
    return tiled_extent<2>(*this, t0, t1);
}

template <int N>
inline tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const {
    static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
    return tiled_extent<3>(*this, t0, t1, t2);
}

// implementation of extent<N>::tile_with_dynamic()
template <int N>
inline tiled_extent<1> extent<N>::tile_with_dynamic(int t0, int dynamic_size) const {
    static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
    return tiled_extent<1>(*this, t0, dynamic_size);
}

template <int N>
inline tiled_extent<2> extent<N>::tile_with_dynamic(int t0, int t1, int dynamic_size) const {
    static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
    return tiled_extent<2>(*this, t0, t1, dynamic_size);
}

template <int N>
inline tiled_extent<3> extent<N>::tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const {
    static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
    return tiled_extent<3>(*this, t0, t1, t2, dynamic_size);
}
#define __HSA_WAVEFRONT_SIZE__ (64)

extern "C" unsigned int __wavesize() __HC__;

#if __hcc_backend__ == HCC_BACKEND_AMDGPU
extern "C" inline unsigned int __wavesize() __HC__ {
    return __HSA_WAVEFRONT_SIZE__;
}
#endif

/** Count the number of 1 bits in the input. */
extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ {
    return __builtin_popcount(input);
}

extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ {
    return __builtin_popcountl(input);
}
2409 extern "C" inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__ {
2410 return (src0 << (32 - src1 - src2)) >> (32 - src2);
2413 extern "C" uint64_t
__bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) __HC__;
2415 extern "C" int __bitextract_s32(
int src0,
unsigned int src1,
unsigned int src2) __HC__;
2417 extern "C" int64_t
__bitextract_s64(int64_t src0,
unsigned int src1,
unsigned int src2) __HC__;
2426 extern "C" unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) __HC__;
2428 extern "C" uint64_t
__bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) __HC__;
2430 extern "C" int __bitinsert_s32(
int src0,
int src1,
unsigned int src2,
unsigned int src3) __HC__;
2432 extern "C" int64_t
__bitinsert_s64(int64_t src0, int64_t src1,
unsigned int src2,
unsigned int src3) __HC__;
2441 extern "C" unsigned int __bitmask_b32(
unsigned int src0,
unsigned int src1) __HC__;
2443 extern "C" uint64_t
__bitmask_b64(
unsigned int src0,
unsigned int src1) __HC__;
2453 unsigned int __bitrev_b32(
unsigned int src0) [[
hc]] __asm(
"llvm.bitreverse.i32");
2455 uint64_t
__bitrev_b64(uint64_t src0) [[
hc]] __asm(
"llvm.bitreverse.i64");
2465 extern "C" unsigned int __bitselect_b32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2467 extern "C" uint64_t
__bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__;
/** Number of leading zero bits, or -1 if the input is zero. */
extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ {
    return input == 0 ? -1 : __builtin_clz(input);
}

extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ {
    return input == 0 ? -1 : __builtin_clzl(input);
}

/** Number of trailing zero bits, or -1 if the input is zero. */
extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ {
    return input == 0 ? -1 : __builtin_ctz(input);
}

extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long input) __HC__ {
    return input == 0 ? -1 : __builtin_ctzl(input);
}
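/**
 * Semantics sketch (illustrative, host-side arithmetic only): __bitextract_u32
 * returns src2 bits of src0 starting at bit offset src1, so
 * __bitextract_u32(0xABCD1234u, 8, 12) == 0xD12. bitextract_ref below is a
 * hypothetical portable reference, not part of this header.
 * @code
 * unsigned int bitextract_ref(unsigned int v, unsigned int off, unsigned int w) {
 *     return (v >> off) & ((1u << w) - 1u);   // assumes 0 < w < 32
 * }
 * @endcode
 */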
2559 extern "C" unsigned int __unpacklo_u8x4(
unsigned int src0,
unsigned int src1) __HC__;
2561 extern "C" uint64_t
__unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__;
2563 extern "C" unsigned int __unpacklo_u16x2(
unsigned int src0,
unsigned int src1) __HC__;
2571 extern "C" int64_t
__unpacklo_s8x8(int64_t src0, int64_t src1) __HC__;
2587 extern "C" unsigned int __unpackhi_u8x4(
unsigned int src0,
unsigned int src1) __HC__;
2589 extern "C" uint64_t
__unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__;
2591 extern "C" unsigned int __unpackhi_u16x2(
unsigned int src0,
unsigned int src1) __HC__;
2599 extern "C" int64_t
__unpackhi_s8x8(int64_t src0, int64_t src1) __HC__;
2615 extern "C" unsigned int __pack_u8x4_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2617 extern "C" uint64_t
__pack_u8x8_u32(uint64_t src0,
unsigned int src1,
unsigned int src2) __HC__;
2619 extern "C" unsigned __pack_u16x2_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2621 extern "C" uint64_t
__pack_u16x4_u32(uint64_t src0,
unsigned int src1,
unsigned int src2) __HC__;
2623 extern "C" uint64_t
__pack_u32x2_u32(uint64_t src0,
unsigned int src1,
unsigned int src2) __HC__;
2625 extern "C" int __pack_s8x4_s32(
int src0,
int src1,
unsigned int src2) __HC__;
2627 extern "C" int64_t
__pack_s8x8_s32(int64_t src0,
int src1,
unsigned int src2) __HC__;
2629 extern "C" int __pack_s16x2_s32(
int src0,
int src1,
unsigned int src2) __HC__;
2631 extern "C" int64_t
__pack_s16x4_s32(int64_t src0,
int src1,
unsigned int src2) __HC__;
2633 extern "C" int64_t
__pack_s32x2_s32(int64_t src0,
int src1,
unsigned int src2) __HC__;
2635 extern "C" double __pack_f32x2_f32(
double src0,
float src1,
unsigned int src2) __HC__;
2644 extern "C" unsigned int __unpack_u32_u8x4(
unsigned int src0,
unsigned int src1) __HC__;
2646 extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0,
unsigned int src1) __HC__;
2648 extern "C" unsigned int __unpack_u32_u16x2(
unsigned int src0,
unsigned int src1) __HC__;
2672 extern "C" unsigned int __bitalign_b32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2679 extern "C" unsigned int __bytealign_b32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2687 extern "C" unsigned int __lerp_u8x4(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2695 extern "C" unsigned int __packcvt_u8x4_f32(
float src0,
float src1,
float src2,
float src3) __HC__;
2711 extern "C" unsigned int __sad_u32_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2713 extern "C" unsigned int __sad_u32_u16x2(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2715 extern "C" unsigned int __sad_u32_u8x4(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2724 extern "C" unsigned int __sadhi_u16x2_u8x4(
unsigned int src0,
unsigned int src1,
unsigned int src2) __HC__;
2781 extern "C" inline int __any(
int predicate) __HC__ {
2790 extern "C" inline int __all(
int predicate) __HC__ {
2800 extern "C" inline uint64_t
__ballot(
int predicate) __HC__ {
#if __hcc_backend__ == HCC_BACKEND_AMDGPU

/** Union used to funnel unsigned int and float values through the int-typed
    AMDGCN cross-lane builtins. */
union __u {
    int i;
    unsigned int u;
    float f;
};

/** ds_bpermute: read a value from the lane selected by index. */
extern "C" int __amdgcn_ds_bpermute(int index, int src) [[hc]];

inline unsigned int __amdgcn_ds_bpermute(int index, unsigned int src) [[hc]] {
    __u tmp; tmp.u = src;
    tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
    return tmp.u;
}

inline float __amdgcn_ds_bpermute(int index, float src) [[hc]] {
    __u tmp; tmp.f = src;
    tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
    return tmp.f;
}

/** ds_permute and ds_swizzle: the unsigned and float overloads follow the
    same __u round-trip shown above for ds_bpermute. */
extern "C" int __amdgcn_ds_permute(int index, int src) [[hc]];

extern "C" int __amdgcn_ds_swizzle(int src, int pattern) [[hc]];

/** move_dpp: cross-lane data movement via the DPP hardware modifier. */
extern "C" int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask,
                                 int bank_mask, bool bound_ctrl) [[hc]];

/** Wavefront shift-left/right and rotate-left/right by one lane; the unsigned
    and float overloads again go through the __u union. */
extern "C" int __amdgcn_wave_sr1(int src, bool bound_ctrl) [[hc]];
extern "C" int __amdgcn_wave_sl1(int src, bool bound_ctrl) [[hc]];
extern "C" int __amdgcn_wave_rr1(int src) [[hc]];
extern "C" int __amdgcn_wave_rl1(int src) [[hc]];

#endif // __hcc_backend__ == HCC_BACKEND_AMDGPU
extern "C" unsigned int __lane_id(void) [[hc]];

/** Direct copy from an indexed active work-item within the wavefront. */
#if __hcc_backend__ == HCC_BACKEND_AMDGPU
inline int __shfl(int var, int srcLane, int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    int self = __lane_id();
    int index = srcLane + (self & ~(width - 1));
    return __amdgcn_ds_bpermute(index << 2, var);
}
#endif

inline unsigned int __shfl(unsigned int var, int srcLane, int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, srcLane, width);
    return tmp.u;
}

inline float __shfl(float var, int srcLane, int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, srcLane, width);
    return tmp.f;
}

/** Copy from an active work-item with lower lane ID relative to the caller. */
#if __hcc_backend__ == HCC_BACKEND_AMDGPU
inline int __shfl_up(int var, const unsigned int delta, const int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    int self = __lane_id();
    int index = self - delta;
    index = (index < (self & ~(width - 1))) ? self : index;
    return __amdgcn_ds_bpermute(index << 2, var);
}
#endif

inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, delta, width);
    return tmp.u;
}

inline float __shfl_up(float var, const unsigned int delta, const int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, delta, width);
    return tmp.f;
}

/** Copy from an active work-item with higher lane ID relative to the caller. */
#if __hcc_backend__ == HCC_BACKEND_AMDGPU
inline int __shfl_down(int var, const unsigned int delta, const int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    int self = __lane_id();
    int index = self + delta;
    index = (int)((self & (width - 1)) + delta) >= width ? self : index;
    return __amdgcn_ds_bpermute(index << 2, var);
}
#endif

inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, delta, width);
    return tmp.u;
}

inline float __shfl_down(float var, const unsigned int delta, const int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, delta, width);
    return tmp.f;
}

/** Copy from an active work-item based on a bitwise XOR of the caller's lane ID. */
#if __hcc_backend__ == HCC_BACKEND_AMDGPU
inline int __shfl_xor(int var, int laneMask, int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    int self = __lane_id();
    int index = self ^ laneMask;
    index = index >= ((self + width) & ~(width - 1)) ? self : index;
    return __amdgcn_ds_bpermute(index << 2, var);
}
#endif

inline float __shfl_xor(float var, int laneMask, int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, laneMask, width);
    return tmp.f;
}

inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width = __HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, laneMask, width);
    return tmp.u;
}
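/**
 * Usage sketch (illustrative): a wavefront-wide sum using __shfl_down inside a
 * [[hc]] kernel. After the loop, lane 0 holds the total.
 * @code
 * float wave_sum(float v) [[hc]] {
 *     for (int offset = __HSA_WAVEFRONT_SIZE__ / 2; offset > 0; offset /= 2)
 *         v += __shfl_down(v, offset);
 *     return v;   // meaningful in lane 0
 * }
 * @endcode
 */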
/** Multiply x and y using only the lower 24 bits of each operand. */
inline unsigned int __mul24(unsigned int x, unsigned int y) [[hc]] {
    return (x & 0x00FFFFFF) * (y & 0x00FFFFFF);
}

inline int __mul24(int x, int y) [[hc]] {
    return ((x << 8) >> 8) * ((y << 8) >> 8);
}

/** 24-bit multiply-add: __mul24(x, y) + z. */
inline unsigned int __mad24(unsigned int x, unsigned int y, unsigned int z) [[hc]] {
    return __mul24(x, y) + z;
}

inline int __mad24(int x, int y, int z) [[hc]] {
    return __mul24(x, y) + z;
}

extern "C" void abort() __HC__;
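/**
 * Semantics note (illustrative): __mul24 is exact only while both operands fit
 * in 24 bits; anything above bit 23 is masked off before the multiply.
 * @code
 * unsigned int ok  = __mul24(0x00FFFFFFu, 2u);  // 0x01FFFFFE, exact
 * unsigned int bad = __mul24(0x01000001u, 3u);  // computes 1 * 3 == 3
 * @endcode
 */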
// ----------------------------------------------------------------------------
// CPU-path emulation of tiled execution (ucontext-based tile barriers)
// ----------------------------------------------------------------------------
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
template <typename Ker, typename Ti>
void bar_wrapper(Ker* f, Ti* t) {
    (*f)(*t);
}

struct barrier_t {
    std::unique_ptr<ucontext_t[]> ctx;
    int idx;
    barrier_t(int a) :
        ctx(new ucontext_t[a + 1]) {}
    template <typename Ti, typename Ker>
    void setctx(int x, char* stack, Ker& f, Ti* tidx, int S) {
        getcontext(&ctx[x]);
        ctx[x].uc_stack.ss_sp = stack;
        ctx[x].uc_stack.ss_size = S;
        ctx[x].uc_link = &ctx[x - 1];
        makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
    }
    void swap(int a, int b) {
        swapcontext(&ctx[a], &ctx[b]);
    }
    void wait() __HC__ {
        --idx;
        swapcontext(&ctx[idx + 1], &ctx[idx]);
    }
};
#endif
/**
 * Synchronization across the work-items of a tile.
 */
class tile_barrier {
public:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    using pb_t = std::shared_ptr<barrier_t>;
    tile_barrier(pb_t pb) : pbar(pb) {}
    tile_barrier(const tile_barrier& other) __CPU__ __HC__ : pbar(other.pbar) {}
#else
    tile_barrier(const tile_barrier& other) __CPU__ __HC__ {}
#endif

    void wait() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        wait_with_all_memory_fence();
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

    void wait_with_all_memory_fence() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

    void wait_with_global_memory_fence() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_GLOBAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

    void wait_with_tile_static_memory_fence() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_LOCAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

private:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    tile_barrier() __CPU__ __HC__ {}
    pb_t pbar;
#else
    tile_barrier() __HC__ {}
#endif
    template <int N> friend class tiled_index;
};

/**
 * Represents a set of related indices subdivided into tiles; the 3D form.
 */
template <int N = 3>
class tiled_index {
public:
    static const int rank = 3;
    tiled_index(const tiled_index& other) __CPU__ __HC__
        : global(other.global), local(other.local), tile(other.tile),
          tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}

    const index<3> global;
    const index<3> local;
    const index<3> tile;
    const index<3> tile_origin;
    const tile_barrier barrier;
    const index<3> tile_dim;

private:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline))
    tiled_index(int a0, int a1, int a2, int b0, int b1, int b2,
                int c0, int c1, int c2, tile_barrier& pb,
                int D0, int D1, int D2) __CPU__ __HC__
        : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0),
          tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {}
#endif

    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() __HC__
        : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
          local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
          tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
          tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2),
                               amp_get_global_id(1) - amp_get_local_id(1),
                               amp_get_global_id(0) - amp_get_local_id(0))),
          tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0)))
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
#else
    __attribute__((always_inline)) tiled_index() __HC__
#endif // __KALMAR_ACCELERATOR__
    {}

    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename K> friend
        void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int);
#endif
};

/** Specialization of tiled_index for 1D tiles. */
template <>
class tiled_index<1> {
public:
    static const int rank = 1;
    tiled_index(const tiled_index& other) __CPU__ __HC__
        : global(other.global), local(other.local), tile(other.tile),
          tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}

    const index<1> global;
    const index<1> local;
    const index<1> tile;
    const index<1> tile_origin;
    const tile_barrier barrier;
    const index<1> tile_dim;

private:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline))
    tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__
        : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {}
#endif

    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() __HC__
        : global(index<1>(amp_get_global_id(0))),
          local(index<1>(amp_get_local_id(0))),
          tile(index<1>(amp_get_group_id(0))),
          tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))),
          tile_dim(index<1>(amp_get_local_size(0)))
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
#else
    __attribute__((always_inline)) tiled_index() __HC__
#endif // __KALMAR_ACCELERATOR__
    {}

    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename K> friend
        void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int);
#endif
};

/** Specialization of tiled_index for 2D tiles. */
template <>
class tiled_index<2> {
public:
    static const int rank = 2;
    tiled_index(const tiled_index& other) __CPU__ __HC__
        : global(other.global), local(other.local), tile(other.tile),
          tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}

    const index<2> global;
    const index<2> local;
    const index<2> tile;
    const index<2> tile_origin;
    const tile_barrier barrier;
    const index<2> tile_dim;

private:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline))
    tiled_index(int a0, int a1, int b0, int b1, int c0, int c1,
                tile_barrier& pb, int D0, int D1) __CPU__ __HC__
        : global(a1, a0), local(b1, b0), tile(c1, c0),
          tile_origin(a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1) {}
#endif

    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() __HC__
        : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
          local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
          tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
          tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1),
                               amp_get_global_id(0) - amp_get_local_id(0))),
          tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0)))
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
#else
    __attribute__((always_inline)) tiled_index() __HC__
#endif // __KALMAR_ACCELERATOR__
    {}

    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename K> friend
        void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int);
#endif
};
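/**
 * Usage sketch (illustrative): a 1D tiled dispatch consuming tiled_index<1>.
 * parallel_for_each and array_view::get_extent() are declared elsewhere in
 * this header; host_vec is a placeholder std::vector<float>.
 * @code
 * hc::array_view<float, 1> data(1024, host_vec);
 * hc::parallel_for_each(data.get_extent().tile(64),
 *                       [=](hc::tiled_index<1> tidx) [[hc]] {
 *     tile_static float lds[64];
 *     lds[tidx.local[0]] = data[tidx.global];
 *     tidx.barrier.wait();
 *     // ... cooperate within the 64-wide tile ...
 * });
 * @endcode
 */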
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
#define SSIZE 1024 * 10

template <int N, typename Kernel, int K>
struct cpu_helper {
    static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
        int i;
        for (i = 0; i < ext[N]; ++i) {
            idx[N] = i;
            cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
        }
    }
};

template <typename Kernel, int K>
struct cpu_helper<K, Kernel, K> {
    static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
        (const_cast<Kernel&>(k))(idx);
    }
};
template <typename Kernel, int N>
void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
    index<N> idx;
    int start = ext[0] * part / Kalmar::NTHREAD;
    int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
    for (int i = start; i < end; i++) {
        idx[0] = i;
        cpu_helper<1, Kernel, N>::call(ker, idx, ext);
    }
}
template <typename Kernel>
void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) {
    int D0 = ext.tile_dim[0];
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    if (stride == 0)
        return;
    char* stk = new char[D0 * SSIZE];
    tiled_index<1>* tidx = new tiled_index<1>[D0];
    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0);
    tile_barrier tbar(hc_bar);

    for (int tx = start; tx < end; tx++) {
        int id = 0;
        char* sp = stk;
        tiled_index<1>* tip = tidx;
        for (int x = 0; x < D0; x++) {
            new (tip) tiled_index<1>(D0 * tx + x, x, tx, tbar, D0);
            hc_bar->setctx(++id, sp, f, tip, SSIZE);
            ++tip;
            sp += SSIZE;
        }
        hc_bar->idx = 0;
        while (hc_bar->idx == 0) {
            hc_bar->idx = id;
            hc_bar->swap(0, id);
        }
    }
    delete[] stk;
    delete[] tidx;
}
template <typename Kernel>
void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) {
    int D0 = ext.tile_dim[0];
    int D1 = ext.tile_dim[1];
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    if (stride == 0)
        return;
    char* stk = new char[D1 * D0 * SSIZE];
    tiled_index<2>* tidx = new tiled_index<2>[D0 * D1];
    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1);
    tile_barrier tbar(hc_bar);

    for (int tx = 0; tx < ext[1] / D1; tx++)
        for (int ty = start; ty < end; ty++) {
            int id = 0;
            char* sp = stk;
            tiled_index<2>* tip = tidx;
            for (int x = 0; x < D1; x++)
                for (int y = 0; y < D0; y++) {
                    new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1);
                    hc_bar->setctx(++id, sp, f, tip, SSIZE);
                    ++tip;
                    sp += SSIZE;
                }
            hc_bar->idx = 0;
            while (hc_bar->idx == 0) {
                hc_bar->idx = id;
                hc_bar->swap(0, id);
            }
        }
    delete[] stk;
    delete[] tidx;
}
template <typename Kernel>
void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) {
    int D0 = ext.tile_dim[0];
    int D1 = ext.tile_dim[1];
    int D2 = ext.tile_dim[2];
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    if (stride == 0)
        return;
    char* stk = new char[D2 * D1 * D0 * SSIZE];
    tiled_index<3>* tidx = new tiled_index<3>[D0 * D1 * D2];
    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
    tile_barrier tbar(hc_bar);

    for (int i = 0; i < ext[2] / D2; i++)
        for (int j = 0; j < ext[1] / D1; j++)
            for (int k = start; k < end; k++) {
                int id = 0;
                char* sp = stk;
                tiled_index<3>* tip = tidx;
                for (int x = 0; x < D2; x++)
                    for (int y = 0; y < D1; y++)
                        for (int z = 0; z < D0; z++) {
                            new (tip) tiled_index<3>(D2 * i + x, D1 * j + y, D0 * k + z,
                                                     x, y, z, i, j, k, tbar, D0, D1, D2);
                            hc_bar->setctx(++id, sp, f, tip, SSIZE);
                            ++tip;
                            sp += SSIZE;
                        }
                hc_bar->idx = 0;
                while (hc_bar->idx == 0) {
                    hc_bar->idx = id;
                    hc_bar->swap(0, id);
                }
            }
    delete[] stk;
    delete[] tidx;
}
template <typename Kernel, int N>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue,
                                        Kernel const& f, extent<N> const& compute_domain) {
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task<Kernel, N>, std::cref(f), std::cref(compute_domain), i);
    // the RAII object joins the worker threads on destruction
    return completion_future();
}

template <typename Kernel>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue,
                                        Kernel const& f, tiled_extent<1> const& compute_domain) {
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile_1D<Kernel>,
                             std::cref(f), std::cref(compute_domain), i);
    return completion_future();
}

template <typename Kernel>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue,
                                        Kernel const& f, tiled_extent<2> const& compute_domain) {
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile_2D<Kernel>,
                             std::cref(f), std::cref(compute_domain), i);
    return completion_future();
}

template <typename Kernel>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue,
                                        Kernel const& f, tiled_extent<3> const& compute_domain) {
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile_3D<Kernel>,
                             std::cref(f), std::cref(compute_domain), i);
    return completion_future();
}
#endif // __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
template <typename T, int N>
struct projection_helper {
    // for array_view<T, N>, where N >= 2:
    //   array_view<T, N - 1> operator[](int i) const __CPU__ __HC__
    static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
    typedef array_view<T, N - 1> result_type;
    static result_type project(array_view<T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        return result_type(now.cache, ext_now, ext_base, idx_base,
                           now.offset + ext_base.size() * stride);
    }
    static result_type project(const array_view<T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        return result_type(now.cache, ext_now, ext_base, idx_base,
                           now.offset + ext_base.size() * stride);
    }
};
template <typename T>
struct projection_helper<T, 1> {
    // for array_view<T, 1>:
    //   T& operator[](int i) const __CPU__ __HC__
    typedef T& result_type;
    static result_type project(array_view<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access(true);
#endif
        T* ptr = reinterpret_cast<T*>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
    static result_type project(const array_view<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access(true);
#endif
        T* ptr = reinterpret_cast<T*>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
};
template <typename T, int N>
struct projection_helper<const T, N> {
    // for array_view<const T, N>, where N >= 2:
    //   array_view<const T, N - 1> operator[](int i) const __CPU__ __HC__
    static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
    typedef array_view<const T, N - 1> const_result_type;
    static const_result_type project(array_view<const T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        auto ret = const_result_type(now.cache, ext_now, ext_base, idx_base,
                                     now.offset + ext_base.size() * stride);
        return ret;
    }
    static const_result_type project(const array_view<const T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        auto ret = const_result_type(now.cache, ext_now, ext_base, idx_base,
                                     now.offset + ext_base.size() * stride);
        return ret;
    }
};
template <typename T>
struct projection_helper<const T, 1> {
    // for array_view<const T, 1>:
    //   const T& operator[](int i) const __CPU__ __HC__
    typedef const T& const_result_type;
    static const_result_type project(array_view<const T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access();
#endif
        const T* ptr = reinterpret_cast<const T*>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
    static const_result_type project(const array_view<const T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access();
#endif
        const T* ptr = reinterpret_cast<const T*>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
};
template <typename T>
struct __has_data {
private:
    struct two { char __lx; char __lxx; };
    template <typename C> static char test(decltype(std::declval<C>().data()));
    template <typename C> static two test(...);
public:
    static const bool value = sizeof(test<T>(0)) == 1;
};
template <typename T>
struct __has_size {
private:
    struct two { char __lx; char __lxx; };
    template <typename C> static char test(decltype(&C::size));
    template <typename C> static two test(...);
public:
    static const bool value = sizeof(test<T>(0)) == 1;
};

template <typename T>
struct __is_container {
    using _T = typename std::remove_reference<T>::type;
    static const bool value = __has_size<_T>::value && __has_data<_T>::value;
};
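/**
 * Usage sketch (illustrative): __is_container<T> is true for types exposing
 * both data() and size(); it gates the Container constructors of array_view.
 * @code
 * static_assert(__is_container<std::vector<float>&>::value, "has data()+size()");
 * static_assert(!__is_container<float*>::value, "raw pointers are excluded");
 * @endcode
 */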
template <typename T, int N>
struct array_projection_helper {
    // for array<T, N>, where N >= 2:
    //   array_view<T, N - 1> operator[](int i0) __CPU__ __HC__
    static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher");
    typedef array_view<T, N - 1> result_type;
    typedef array_view<const T, N - 1> const_result_type;
    static result_type project(array<T, N>& now, int stride) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize(true);
#endif
        int comp[N - 1], i;
        for (i = N - 1; i > 0; --i)
            comp[i - 1] = now.extent[i];
        extent<N - 1> ext(comp);
        int offset = ext.size() * stride;
#if __KALMAR_ACCELERATOR__ != 1
        if (offset >= now.extent.size())
            throw runtime_exception("errorMsg_throw", 0);
#endif
        return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
    }
    static const_result_type project(const array<T, N>& now, int stride) __CPU__ __HC__ {
        int comp[N - 1], i;
        for (i = N - 1; i > 0; --i)
            comp[i - 1] = now.extent[i];
        extent<N - 1> ext(comp);
        int offset = ext.size() * stride;
        return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
    }
};
template <typename T>
struct array_projection_helper<T, 1> {
    // for array<T, 1>:
    //   T& operator[](int i0) __CPU__ __HC__
    typedef T& result_type;
    typedef const T& const_result_type;
    static result_type project(array<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize(true);
#endif
        T* ptr = reinterpret_cast<T*>(now.m_device.get() + i);
        return *ptr;
    }
    static const_result_type project(const array<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize();
#endif
        const T* ptr = reinterpret_cast<const T*>(now.m_device.get() + i);
        return *ptr;
    }
};

template <int N>
static inline const extent<N>& check(const extent<N>& ext) {
#if __KALMAR_ACCELERATOR__ != 1
    for (int i = 0; i < N; i++) {
        if (ext[i] <= 0)
            throw runtime_exception("errorMsg_throw", 0);
    }
#endif
    return ext;
}
// forward declarations of the copy routines implemented later in this header
template <typename T, int N> void copy(const array<T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array<T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array_view<const T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array_view<T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, array<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, const array_view<T, N>& dest);
template <typename OutputIter, typename T, int N> void copy(const array<T, N>& src, OutputIter destBegin);
template <typename OutputIter, typename T, int N> void copy(const array_view<T, N>& src, OutputIter destBegin);
/**
 * Represents an N-dimensional region of memory, resident on an accelerator.
 */
template <typename T, int N = 1>
class array {
    static_assert(!std::is_const<T>::value, "array<const T> is not supported");
public:
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<T> acc_buffer_t;
#endif

    static const int rank = N;
    typedef T value_type;

    array(const array& other) :
        array(other.get_extent(), other.get_accelerator_view())
    { copy(other, *this); }

    array(array&& other) :
        m_device(other.m_device), extent(other.extent)
    { other.m_device.reset(); }

    explicit array(const extent<N>& ext) :
        array(ext, accelerator(L"default").get_default_view()) {}

    explicit array(int e0) :
        array(hc::extent<N>(e0)) { static_assert(N == 1, "illegal"); }

    template <typename InputIter>
    array(const extent<N>& ext, InputIter srcBegin) :
        array(ext, srcBegin, accelerator(L"default").get_default_view()) {}
    template <typename InputIter>
    array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd) :
        array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}

    template <typename InputIter>
    array(int e0, InputIter srcBegin)
        : array(hc::extent<N>(e0), srcBegin) {}
    template <typename InputIter>
    array(int e0, InputIter srcBegin, InputIter srcEnd)
        : array(hc::extent<N>(e0), srcBegin, srcEnd) {}
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin)
        : array(hc::extent<N>(e0, e1), srcBegin) {}
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
        : array(hc::extent<N>(e0, e1), srcBegin, srcEnd) {}
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin)
        : array(hc::extent<N>(e0, e1, e2), srcBegin) {}
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
        : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd) {}

    explicit array(const array_view<const T, N>& src) :
        array(src.get_extent(), accelerator(L"default").get_default_view())
    { copy(src, *this); }
    array(const extent<N>& ext, accelerator_view av, access_type cpu_access_type = access_type_auto)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(ext.size()), extent(ext) {}
#else
        : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type),
          extent(ext) {}
#endif

    explicit array(int e0, void* accelerator_pointer)
        : array(hc::extent<N>(e0), accelerator(L"default").get_default_view(), accelerator_pointer) {}
    explicit array(int e0, int e1, void* accelerator_pointer)
        : array(hc::extent<N>(e0, e1), accelerator(L"default").get_default_view(), accelerator_pointer) {}
    explicit array(int e0, int e1, int e2, void* accelerator_pointer)
        : array(hc::extent<N>(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {}

    explicit array(const extent<N>& ext, void* accelerator_pointer) :
        array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {}

    array(const extent<N>& ext, accelerator_view av, void* accelerator_pointer,
          access_type cpu_access_type = access_type_auto)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(ext.size(), accelerator_pointer),
          extent(ext) {}
#else
        : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type),
          extent(ext) {}
#endif
    template <typename InputIter>
    array(const extent<N>& ext, InputIter srcBegin, accelerator_view av,
          access_type cpu_access_type = access_type_auto)
        : array(ext, av, cpu_access_type) { copy(srcBegin, *this); }
    template <typename InputIter>
    array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(ext, av, cpu_access_type) {
        if (ext.size() < std::distance(srcBegin, srcEnd))
            throw runtime_exception("errorMsg_throw", 0);
        copy(srcBegin, srcEnd, *this);
    }

    array(const array_view<const T, N>& src, accelerator_view av,
          access_type cpu_access_type = access_type_auto)
        : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); }

    template <typename InputIter>
    array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(extent<N>(e0), srcBegin, av, cpu_access_type) {}
    template <typename InputIter>
    array(int e0, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {}
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(hc::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {}
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {}
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(hc::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {}
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {}
    array(const extent<N>& ext, accelerator_view av, accelerator_view associated_av)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(ext.size()), extent(ext) {}
#else
        : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto),
          extent(ext) {}
#endif

    template <typename InputIter>
    array(const extent<N>& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        : array(ext, av, associated_av) { copy(srcBegin, *this); }
    template <typename InputIter>
    array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        : array(ext, av, associated_av) {
        if (ext.size() < std::distance(srcBegin, srcEnd))
            throw runtime_exception("errorMsg_throw", 0);
        copy(srcBegin, srcEnd, *this);
    }

    array(const array_view<const T, N>& src, accelerator_view av, accelerator_view associated_av)
        : array(src.get_extent(), av, associated_av)
    { copy(src, *this); }

    template <typename InputIter>
    array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        : array(extent<N>(e0), srcBegin, av, associated_av) {}
    template <typename InputIter>
    array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        : array(extent<N>(e0), srcBegin, srcEnd, av, associated_av) {}
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        : array(hc::extent<N>(e0, e1), srcBegin, av, associated_av) {}
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, associated_av) {}
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        : array(hc::extent<N>(e0, e1, e2), srcBegin, av, associated_av) {}
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {}
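    /**
     * Usage sketch (illustrative): construct an array from host data on an
     * explicit accelerator_view and read it back.
     * @code
     * std::vector<int> v(100, 7);
     * hc::accelerator acc;
     * hc::array<int, 1> a(hc::extent<1>(100), v.begin(), v.end(),
     *                     acc.get_default_view());
     * std::vector<int> back = a;   // operator std::vector<T>() copies back
     * @endcode
     */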
    array& operator=(const array& other) {
        if (this != &other) {
            array arr(other);
            *this = std::move(arr);
        }
        return *this;
    }

    array& operator=(array&& other) {
        if (this != &other) {
            extent = other.extent;
            m_device = other.m_device;
            other.m_device.reset();
        }
        return *this;
    }

    array& operator=(const array_view<T, N>& src) {
        extent = src.get_extent();
        array arr(src);
        *this = std::move(arr);
        return *this;
    }
    void copy_to(array& dest) const {
#if __KALMAR_ACCELERATOR__ != 1
        for (int i = 0; i < N; i++) {
            if (dest.extent[i] < this->extent[i])
                throw runtime_exception("errorMsg_throw", 0);
        }
#endif
        copy(*this, dest);
    }

    /** Returns a host pointer to the data, or nullptr if not CPU-accessible. */
    T* data() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (!m_device.get())
            return nullptr;
        m_device.synchronize(true);
#endif
        return reinterpret_cast<T*>(m_device.get());
    }

    /** Returns the device-side pointer backing this array. */
    T* accelerator_pointer() const __CPU__ __HC__ {
        return reinterpret_cast<T*>(m_device.get_device_pointer());
    }
    operator std::vector<T>() const {
        std::vector<T> vec(extent.size());
        copy(*this, vec.data());
        return std::move(vec);
    }

    T& operator[](const index<N>& idx) __CPU__ __HC__ {
#ifndef __KALMAR_ACCELERATOR__
        if (!m_device.get())
            throw runtime_exception("The array is not accessible on CPU.", 0);
        m_device.synchronize(true);
#endif
        T* ptr = reinterpret_cast<T*>(m_device.get());
        return ptr[Kalmar::amp_helper<N, index<N>, extent<N>>::flatten(idx, extent)];
    }

    T& operator()(const index<N>& idx) __CPU__ __HC__ {
        return (*this)[idx];
    }

    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (!m_device.get())
            throw runtime_exception("The array is not accessible on CPU.", 0);
        m_device.synchronize();
#endif
        T* ptr = reinterpret_cast<T*>(m_device.get());
        return ptr[Kalmar::amp_helper<N, index<N>, extent<N>>::flatten(idx, extent)];
    }

    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
        return (*this)[idx];
    }
    T& operator()(int i0, int i1, int i2) __CPU__ __HC__ {
        return (*this)[index<3>(i0, i1, i2)];
    }

    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
        return (*this)[index<3>(i0, i1, i2)];
    }

    typename array_projection_helper<T, N>::result_type
        operator[](int i) __CPU__ __HC__ {
        return array_projection_helper<T, N>::project(*this, i);
    }

    typename array_projection_helper<T, N>::const_result_type
        operator[](int i) const __CPU__ __HC__ {
        return array_projection_helper<T, N>::project(*this, i);
    }

    array_view<T, N> section(const index<N>& origin, const extent<N>& ext) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (!Kalmar::amp_helper<N, index<N>, extent<N>>::contains(origin, ext, this->extent))
            throw runtime_exception("errorMsg_throw", 0);
#endif
        array_view<T, N> av(*this);
        return av.section(origin, ext);
    }

    array_view<const T, N> section(const index<N>& origin, const extent<N>& ext) const __CPU__ __HC__ {
        array_view<const T, N> av(*this);
        return av.section(origin, ext);
    }
    array_view<T, 1> section(int i0, int e0) __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), extent<1>(e0));
    }
    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), extent<1>(e0));
    }
    array_view<T, 2> section(int i0, int i1, int e0, int e1) __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), extent<2>(e0, e1));
    }
    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), extent<2>(e0, e1));
    }
    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), extent<3>(e0, e1, e2));
    }
    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), extent<3>(e0, e1, e2));
    }
    template <typename ElementType>
    array_view<ElementType, 1> reinterpret_as() __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
        if ((extent.size() * sizeof(T)) % sizeof(ElementType))
            throw runtime_exception("errorMsg_throw", 0);
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        array_view<ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
        return av;
    }

    template <typename ElementType>
    array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        array_view<const ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
        return av;
    }
    template <int K>
    array_view<T, K> view_as(const extent<K>& viewExtent) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (viewExtent.size() > extent.size())
            throw runtime_exception("errorMsg_throw", 0);
#endif
        array_view<T, K> av(m_device, viewExtent, 0);
        return av;
    }

    template <int K>
    array_view<const T, K> view_as(const extent<K>& viewExtent) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (viewExtent.size() > extent.size())
            throw runtime_exception("errorMsg_throw", 0);
#endif
        const array_view<const T, K> av(m_device, viewExtent, 0);
        return av;
    }

    ~array() {}

    const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; }
    int get_offset() const __CPU__ __HC__ { return 0; }

private:
    acc_buffer_t m_device;
    extent<N> extent;

    template <typename Q, int K> friend struct projection_helper;
    template <typename Q, int K> friend struct array_projection_helper;
};
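/**
 * Usage sketch (illustrative): lightweight views over an existing array.
 * @code
 * hc::array<float, 2> m(hc::extent<2>(4, 8));
 * hc::array_view<float, 2> row2 = m.section(hc::index<2>(2, 0),
 *                                           hc::extent<2>(1, 8));
 * hc::array_view<unsigned int, 1> bits =
 *     m.reinterpret_as<unsigned int>();   // same bytes, rank-1 view
 * @endcode
 */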
/**
 * The array_view<T, N> type represents a possibly cached view into the data
 * held in an array<T, N>, or a section thereof, or host memory.
 */
template <typename T, int N = 1>
class array_view {
public:
    typedef typename std::remove_const<T>::type nc_T;
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<T> acc_buffer_t;
#endif

    static const int rank = N;
    typedef T value_type;

    array_view() = delete;

    array_view(array<T, N>& src) __CPU__ __HC__
        : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}

    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(const hc::extent<N>& extent, Container& src)
        : array_view(extent, src.data())
    { static_assert(std::is_same<decltype(src.data()), T*>::value,
                    "container element type and array view element type must match"); }

    array_view(const hc::extent<N>& ext, value_type* src) __CPU__ __HC__
#if __KALMAR_ACCELERATOR__ == 1
        : cache((T*)(src)),
          extent(ext), extent_base(ext), offset(0) {}
#else
        : cache(ext.size(), (T*)(src)),
          extent(ext), extent_base(ext), offset(0) {}
#endif

    explicit array_view(const hc::extent<N>& ext)
        : cache(ext.size()),
          extent(ext), extent_base(ext), offset(0) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, Container& src)
        : array_view(hc::extent<N>(e0), src) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, Container& src)
        : array_view(hc::extent<N>(e0, e1), src) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, int e2, Container& src)
        : array_view(hc::extent<N>(e0, e1, e2), src) {}

    array_view(int e0, value_type* src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0), src) {}
    array_view(int e0, int e1, value_type* src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0, e1), src) {}
    array_view(int e0, int e1, int e2, value_type* src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0, e1, e2), src) {}

    explicit array_view(int e0)
        : array_view(hc::extent<N>(e0)) {}
    explicit array_view(int e0, int e1)
        : array_view(hc::extent<N>(e0, e1)) {}
    explicit array_view(int e0, int e1, int e2)
        : array_view(hc::extent<N>(e0, e1, e2)) {}
    array_view(const array_view& other) __CPU__ __HC__
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}

    array_view& operator=(const array_view& other) __CPU__ __HC__ {
        if (this != &other) {
            cache = other.cache;
            extent = other.extent;
            index_base = other.index_base;
            extent_base = other.extent_base;
            offset = other.offset;
        }
        return *this;
    }
    void copy_to(array<T, N>& dest) const {
#if __KALMAR_ACCELERATOR__ != 1
        for (int i = 0; i < N; i++) {
            if (dest.get_extent()[i] < this->extent[i])
                throw runtime_exception("errorMsg_throw", 0);
        }
#endif
        copy(*this, dest);
    }

    T* data() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        static_assert(N == 1, "data() is only permissible on array views of rank 1");
        return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
    }

    T* accelerator_pointer() const __CPU__ __HC__ {
        return reinterpret_cast<T*>(cache.get_device_pointer() + offset + index_base[0]);
    }
    completion_future synchronize_async() const {
        std::future<void> fut = std::async([&]() mutable { synchronize(); });
        return completion_future(fut.share());
    }

    void synchronize_to(const accelerator_view& av) const {
#if __KALMAR_ACCELERATOR__ != 1
        cache.sync_to(av.pQueue);
#endif
    }
    T& operator[](const index<N>& idx) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        T* ptr = reinterpret_cast<T*>(cache.get() + offset);
        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
    }

    T& operator()(const index<N>& idx) const __CPU__ __HC__ {
        return (*this)[idx];
    }

    T& get_ref(const index<N>& idx) const __CPU__ __HC__;
    T& operator()(int i0, int i1) const __CPU__ __HC__ {
        static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
        return (*this)[index<2>(i0, i1)];
    }

    T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
        static_assert(N == 3, "T& array_view::operator()(int,int,int) is only permissible on array_view<T, 3>");
        return (*this)[index<3>(i0, i1, i2)];
    }

    typename projection_helper<T, N>::result_type
        operator[](int i) const __CPU__ __HC__ {
        return projection_helper<T, N>::project(*this, i);
    }

    typename projection_helper<T, N>::result_type
        operator()(int i0) const __CPU__ __HC__ { return (*this)[i0]; }
    array_view<T, N> section(const index<N>& idx, const extent<N>& ext) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (!Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, ext, this->extent))
            throw runtime_exception("errorMsg_throw", 0);
#endif
        array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset);
        return av;
    }

    array_view<T, N> section(const index<N>& idx) const __CPU__ __HC__ {
        hc::extent<N> ext(extent);
        Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
        return section(idx, ext);
    }

    array_view<T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
        index<N> idx;
        return section(idx, ext);
    }

    array_view<T, 1> section(int i0, int e0) const __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), hc::extent<1>(e0));
    }

    array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
    }

    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
    }
    template <typename ElementType>
    array_view<ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
        if ((extent.size() * sizeof(T)) % sizeof(ElementType))
            throw runtime_exception("errorMsg_throw", 0);
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        array_view<ElementType, 1> av(buffer_type(cache), hc::extent<1>(size),
                                      (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
        return av;
    }

    template <int K>
    array_view<T, K> view_as(hc::extent<K> viewExtent) const __CPU__ __HC__ {
        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        if (viewExtent.size() > extent.size())
            throw runtime_exception("errorMsg_throw", 0);
#endif
        array_view<T, K> av(cache, viewExtent, offset + index_base[0]);
        return av;
    }

    ~array_view() __CPU__ __HC__ {}
    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }

    int get_offset() const __CPU__ __HC__ { return offset; }

    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }

private:
    template <typename Q, int K> friend struct projection_helper;
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;

    template <typename Q, int K> friend
        bool is_flat(const array_view<Q, K>&) noexcept;
    template <typename Q, int K> friend
        void copy(const array<Q, K>&, const array_view<Q, K>&);
    template <typename InputIter, typename Q, int K> friend
        void copy(InputIter, InputIter, const array_view<Q, K>&);
    template <typename Q, int K> friend
        void copy(const array_view<const Q, K>&, array<Q, K>&);
    template <typename OutputIter, typename Q, int K> friend
        void copy(const array_view<Q, K>&, OutputIter);
    template <typename Q, int K> friend
        void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);

    // used by view_as and reinterpret_as
    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
               int offset) __CPU__ __HC__
        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}

    // used by sections and projections
    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
               const hc::extent<N>& ext_b, const index<N>& idx_b,
               int off) __CPU__ __HC__
        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
          offset(off) {}

    acc_buffer_t cache;
    hc::extent<N> extent;
    hc::extent<N> extent_base;
    index<N> index_base;
    int offset;
};
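/**
 * Usage sketch (illustrative): an array_view wrapping host memory; device
 * writes become visible on the host after synchronize()/synchronize_async().
 * parallel_for_each and get_extent() are declared elsewhere in this header.
 * @code
 * std::vector<float> host(256, 0.0f);
 * hc::array_view<float, 1> av(256, host);
 * hc::parallel_for_each(av.get_extent(), [=](hc::index<1> i) [[hc]] {
 *     av[i] = i[0] * 2.0f;
 * });
 * hc::completion_future done = av.synchronize_async();
 * done.wait();                       // host vector now holds the results
 * @endcode
 */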
/** Specialization of array_view for const element types (read-only views). */
template <typename T, int N>
class array_view<const T, N> {
public:
    typedef typename std::remove_const<T>::type nc_T;

#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<nc_T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<const T> acc_buffer_t;
#endif

    static const int rank = N;
    typedef const T value_type;

    array_view() = delete;

    array_view(const array<T, N>& src) __CPU__ __HC__
        : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(const hc::extent<N>& extent, const Container& src)
        : array_view(extent, src.data())
    { static_assert(std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value,
                    "container element type and array view element type must match"); }

    array_view(const hc::extent<N>& ext, const value_type* src) __CPU__ __HC__
#if __KALMAR_ACCELERATOR__ == 1
        : cache((nc_T*)(src)),
          extent(ext), extent_base(ext), offset(0) {}
#else
        : cache(ext.size(), src),
          extent(ext), extent_base(ext), offset(0) {}
#endif

    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, Container& src)
        : array_view(hc::extent<N>(e0), src) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, Container& src)
        : array_view(hc::extent<N>(e0, e1), src) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, int e2, Container& src)
        : array_view(hc::extent<N>(e0, e1, e2), src) {}

    array_view(int e0, const value_type* src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0), src) {}
    array_view(int e0, int e1, const value_type* src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0, e1), src) {}
    array_view(int e0, int e1, int e2, const value_type* src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0, e1, e2), src) {}

    array_view(const array_view<nc_T, N>& other) __CPU__ __HC__
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}

    array_view(const array_view& other) __CPU__ __HC__
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}
    array_view& operator=(const array_view<nc_T, N>& other) __CPU__ __HC__ {
        cache = other.cache;
        extent = other.extent;
        index_base = other.index_base;
        extent_base = other.extent_base;
        offset = other.offset;
        return *this;
    }

    array_view& operator=(const array_view& other) __CPU__ __HC__ {
        if (this != &other) {
            cache = other.cache;
            extent = other.extent;
            index_base = other.index_base;
            extent_base = other.extent_base;
            offset = other.offset;
        }
        return *this;
    }
    const T* data() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access();
#endif
        static_assert(N == 1, "data() is only permissible on array views of rank 1");
        return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
    }

    const T* accelerator_pointer() const __CPU__ __HC__ {
        return reinterpret_cast<const T*>(cache.get_device_pointer() + offset + index_base[0]);
    }

    completion_future synchronize_async() const {
        std::future<void> fut = std::async([&]() mutable { synchronize(); });
        return completion_future(fut.share());
    }
    void synchronize_to(const accelerator_view& av) const {
#if __KALMAR_ACCELERATOR__ != 1
        cache.sync_to(av.pQueue);
#endif
    }

    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access();
#endif
        const T* ptr = reinterpret_cast<const T*>(cache.get() + offset);
        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
    }

    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
        return (*this)[idx];
    }

    const T& get_ref(const index<N>& idx) const __CPU__ __HC__;
    const T& operator()(int i0) const __CPU__ __HC__ {
        static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
        return (*this)[index<1>(i0)];
    }

    const T& operator()(int i0, int i1) const __CPU__ __HC__ {
        static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
        return (*this)[index<2>(i0, i1)];
    }

    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
        static_assert(N == 3, "const T& array_view::operator()(int,int,int) is only permissible on array_view<T, 3>");
        return (*this)[index<3>(i0, i1, i2)];
    }
    typename projection_helper<const T, N>::const_result_type
        operator[](int i) const __CPU__ __HC__ {
        return projection_helper<const T, N>::project(*this, i);
    }

    array_view<const T, N> section(const index<N>& idx, const extent<N>& ext) const __CPU__ __HC__ {
        array_view<const T, N> av(cache, ext, extent_base, idx + index_base, offset);
        return av;
    }

    array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
        hc::extent<N> ext(extent);
        Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
        return section(idx, ext);
    }

    array_view<const T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
        index<N> idx;
        return section(idx, ext);
    }

    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), hc::extent<1>(e0));
    }

    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
    }

    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
    }
    template <typename ElementType>
    array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        array_view<const ElementType, 1> av(buffer_type(cache), hc::extent<1>(size),
                                            (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
        return av;
    }

    template <int K>
    array_view<const T, K> view_as(hc::extent<K> viewExtent) const __CPU__ __HC__ {
        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        if (viewExtent.size() > extent.size())
            throw runtime_exception("errorMsg_throw", 0);
#endif
        array_view<const T, K> av(cache, viewExtent, offset + index_base[0]);
        return av;
    }

    ~array_view() __CPU__ __HC__ {}
    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }

    int get_offset() const __CPU__ __HC__ { return offset; }

    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }

private:
    template <typename Q, int K> friend struct projection_helper;
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;

    template <typename Q, int K> friend
        bool is_flat(const array_view<Q, K>&) noexcept;
    template <typename Q, int K> friend
        void copy(const array<Q, K>&, const array_view<Q, K>&);
    template <typename InputIter, typename Q, int K> friend
        void copy(InputIter, InputIter, const array_view<Q, K>&);
    template <typename Q, int K> friend
        void copy(const array_view<const Q, K>&, array<Q, K>&);
    template <typename OutputIter, typename Q, int K> friend
        void copy(const array_view<Q, K>&, OutputIter);
    template <typename Q, int K> friend
        void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);

    // used by view_as and reinterpret_as
    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
               int offset) __CPU__ __HC__
        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}

    // used by sections and projections
    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
               const hc::extent<N>& ext_b, const index<N>& idx_b,
               int off) __CPU__ __HC__
        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
          offset(off) {}

    acc_buffer_t cache;
    hc::extent<N> extent;
    hc::extent<N> extent_base;
    index<N> index_base;
    int offset;
};
template <typename T, int N>
static inline bool is_flat(const array_view<T, N>& av) noexcept {
    return av.extent == av.extent_base && av.index_base == index<N>();
}

template <typename T>
static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
// helpers that walk a sectioned (non-contiguous) view one dimension at a time
template <typename InputIter, typename T, int N, int dim>
struct copy_input
{
    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        size_t stride = 1;
        for (int i = dim; i < N; i++)
            stride *= base[i];
        ptr += stride * idx[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse into copy_input<InputIter, T, N, dim + 1> and
            //     advance ptr by stride (loop body elided in this listing)
        }
    }
};

template <typename InputIter, typename T, int N>
struct copy_input<InputIter, T, N, N>
{
    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        InputIter end = It;
        std::advance(end, ext[N - 1]);
        std::copy(It, end, ptr);
        It = end;
    }
};

template <typename OutputIter, typename T, int N, int dim>
struct copy_output
{
    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        size_t stride = 1;
        for (int i = dim; i < N; i++)
            stride *= base[i];
        ptr += stride * idx[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse into copy_output<OutputIter, T, N, dim + 1> and
            //     advance ptr by stride (loop body elided in this listing)
        }
    }
};

template <typename OutputIter, typename T, int N>
struct copy_output<OutputIter, T, N, N>
{
    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        It = std::copy(ptr, ptr + ext[N - 1], It);
    }
};

template <typename T, int N, int dim>
struct copy_bidir
{
    void operator()(const T* src, T* dst, const extent<N>& ext,
                    const extent<N>& base1, const index<N>& idx1,
                    const extent<N>& base2, const index<N>& idx2)
    {
        size_t stride1 = 1;
        for (int i = dim; i < N; i++)
            stride1 *= base1[i];
        src += stride1 * idx1[dim - 1];
        size_t stride2 = 1;
        for (int i = dim; i < N; i++)
            stride2 *= base2[i];
        dst += stride2 * idx2[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse into copy_bidir<T, N, dim + 1> and advance both
            //     pointers (loop body elided in this listing)
        }
    }
};

template <typename T, int N>
struct copy_bidir<T, N, N>
{
    void operator()(const T* src, T* dst, const extent<N>& ext,
                    const extent<N>& base1, const index<N>& idx1,
                    const extent<N>& base2, const index<N>& idx2)
    {
        std::copy(src, src + ext[N - 1], dst);
    }
};
// do_copy: dispatches a flat host<->container transfer, specialized on the
// iterator category and rank
template <typename Iter, typename T, int N>
struct do_copy
{
    template <template <typename, int> class _amp_container>
    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
        size_t size = dest.get_extent().size();
        size_t offset = dest.get_offset();
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        std::copy(srcBegin, srcEnd, ptr);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, N>& src, Iter destBegin) {
        size_t size = src.get_extent().size();
        size_t offset = src.get_offset();
        bool modify = false;
        const T* ptr = src.internal().map_ptr(modify, size, offset);
        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
};

template <typename Iter, typename T>
struct do_copy<Iter, T, 1>
{
    template <template <typename, int> class _amp_container>
    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
        size_t size = dest.get_extent().size();
        size_t offset = dest.get_offset() + dest.get_index_base()[0];
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        std::copy(srcBegin, srcEnd, ptr);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, 1>& src, Iter destBegin) {
        size_t size = src.get_extent().size();
        size_t offset = src.get_offset() + src.get_index_base()[0];
        bool modify = false;
        const T* ptr = src.internal().map_ptr(modify, size, offset);
        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
};

// raw-pointer sources and sinks can go straight through the buffer's
// read/write interface
template <typename T, int N>
struct do_copy<T*, T, N>
{
    template <template <typename, int> class _amp_container>
    void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, N>& src, T* destBegin) {
        src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
    }
};

template <typename T>
struct do_copy<T*, T, 1>
{
    template <template <typename, int> class _amp_container>
    void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
                              dest.get_offset() + dest.get_index_base()[0], true);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, 1>& src, T* destBegin) {
        src.internal().read(destBegin, src.get_extent().size(),
                            src.get_offset() + src.get_index_base()[0]);
    }
};
template <typename T, int N>
void copy(const array<T, N>& src, array<T, N>& dest) {
    src.internal().copy(dest.internal(), 0, 0, 0);
}

template <typename T, int N>
void copy(const array<T, N>& src, const array_view<T, N>& dest) {
    if (is_flat(dest))
        src.internal().copy(dest.internal(), src.get_offset(),
                            dest.get_offset(), dest.get_extent().size());
    else {
        // destination is a sectioned view: stage through mapped host pointers
        size_t srcSize = src.extent.size();
        size_t srcOffset = 0;
        bool srcModify = false;
        size_t destSize = dest.extent_base.size();
        size_t destOffset = dest.offset;
        bool destModify = true;
        T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
        T* p = pSrc;
        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
        copy_input<T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
        dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
        src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
    }
}

template <typename T>
void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}

template <typename T, int N>
void copy(const array_view<const T, N>& src, array<T, N>& dest) {
    if (is_flat(src))
        src.internal().copy(dest.internal(), src.get_offset(),
                            dest.get_offset(), dest.get_extent().size());
    else {
        size_t srcSize = src.extent_base.size();
        size_t srcOffset = src.offset;
        bool srcModify = false;
        size_t destSize = dest.extent.size();
        size_t destOffset = 0;
        bool destModify = true;
        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
        T* p = pDst;
        const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
        copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
        src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
    }
}

template <typename T, int N>
void copy(const array_view<T, N>& src, array<T, N>& dest) {
    // delegates to the array_view<const T, N> overload above
    // (body elided in this listing)
}

template <typename T>
void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}

template <typename T, int N>
void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
    if (is_flat(src)) {
        if (is_flat(dest))
            src.internal().copy(dest.internal(), src.get_offset(),
                                dest.get_offset(), dest.get_extent().size());
        else {
            // flat source into a sectioned destination
            size_t srcSize = src.extent.size();
            size_t srcOffset = 0;
            bool srcModify = false;
            size_t destSize = dest.extent_base.size();
            size_t destOffset = dest.offset;
            bool destModify = true;
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            const T* p = pSrc;
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            copy_input<const T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
        }
    } else {
        if (is_flat(dest)) {
            // sectioned source into a flat destination
            size_t srcSize = src.extent_base.size();
            size_t srcOffset = src.offset;
            bool srcModify = false;
            size_t destSize = dest.extent.size();
            size_t destOffset = 0;
            bool destModify = true;
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            T* p = pDst;
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
            dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        } else {
            // both views are sectioned: walk both index spaces in lockstep
            size_t srcSize = src.extent_base.size();
            size_t srcOffset = src.offset;
            bool srcModify = false;
            size_t destSize = dest.extent_base.size();
            size_t destOffset = dest.offset;
            bool destModify = true;
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            copy_bidir<T, N, 1>()(pSrc, pDst, dest.extent, src.extent_base,
                                  src.index_base, dest.extent_base, dest.index_base);
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        }
    }
}

template <typename T, int N>
void copy(const array_view<T, N>& src, const array_view<T, N>& dest) {
    // delegates to the array_view<const T, N> overload above
    // (body elided in this listing)
}

template <typename T>
void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
#if __KALMAR_ACCELERATOR__ != 1
    if( ( std::distance(srcBegin,srcEnd) <=0 )||( std::distance(srcBegin,srcEnd) < dest.get_extent().size() ))
        // ... throws: the source range is empty or smaller than the
        //     destination extent (message elided in this listing)
#endif
    do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, array<T, N>& dest) {
    InputIter srcEnd = srcBegin;
    std::advance(srcEnd, dest.get_extent().size());
    copy(srcBegin, srcEnd, dest);
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
    if (is_flat(dest))
        do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
    else {
        size_t size = dest.extent_base.size();
        size_t offset = dest.offset;
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        copy_input<InputIter, T, N, 1>()(srcBegin, ptr, dest.extent, dest.extent_base, dest.index_base);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, const array_view<T, N>& dest) {
    InputIter srcEnd = srcBegin;
    std::advance(srcEnd, dest.get_extent().size());
    copy(srcBegin, srcEnd, dest);
}

template <typename OutputIter, typename T, int N>
void copy(const array<T, N>& src, OutputIter destBegin) {
    do_copy<OutputIter, T, N>()(src, destBegin);
}

template <typename OutputIter, typename T, int N>
void copy(const array_view<T, N>& src, OutputIter destBegin) {
    if (is_flat(src))
        do_copy<OutputIter, T, N>()(src, destBegin);
    else {
        size_t size = src.extent_base.size();
        size_t offset = src.offset;
        bool modify = false;
        T* ptr = src.internal().map_ptr(modify, size, offset);
        copy_output<OutputIter, T, N, 1>()(ptr, destBegin, src.extent, src.extent_base, src.index_base);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
}
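// --- Editor's usage sketch (not part of the original header; names are
// illustrative, and it assumes <vector> is available). Host-to-device and
// device-to-host transfers through the iterator overloads above:
inline void example_copy() {
    std::vector<int> src(64, 7), sink(64, 0);
    hc::array<int, 1> arr(64);
    hc::copy(src.begin(), src.end(), arr); // host container -> array
    hc::copy(arr, sink.begin());           // array -> host container
}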
// Each copy_async overload wraps the corresponding synchronous copy() in a
// deferred std::async task and returns it as a completion_future; iterator
// arguments are captured by value so they remain valid until the task runs.
template <typename T, int N>
completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
    return completion_future(fut.share());
}

template <typename OutputIter, typename T, int N>
completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
    std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
    return completion_future(fut.share());
}

template <typename OutputIter, typename T, int N>
completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
    std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
    return completion_future(fut.share());
}

// Three further overloads with the same deferred-copy body follow in the
// original; their signatures are elided in this listing.
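// --- Editor's usage sketch (not part of the original header; names are
// illustrative). copy_async returns a completion_future; the deferred copy
// runs when the future is waited on:
inline void example_copy_async() {
    std::vector<int> src(64, 7);
    hc::array<int, 1> arr(64);
    hc::completion_future cf = hc::copy_async(src.begin(), src.end(), arr);
    cf.wait(); // block until the copy has completed
}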
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_exchange_int(int *p, int val) __HC__;
extern "C" float atomic_exchange_float(float *p, float val) __HC__;
extern "C" uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) __CPU__ __HC__ { return atomic_exchange_unsigned(dest, val); }
static inline int atomic_exchange(int * dest, int val) __CPU__ __HC__ { return atomic_exchange_int(dest, val); }
static inline float atomic_exchange(float * dest, float val) __CPU__ __HC__ { return atomic_exchange_float(dest, val); }
static inline uint64_t atomic_exchange(uint64_t * dest, uint64_t val) __CPU__ __HC__ { return atomic_exchange_uint64(dest, val); }
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
int atomic_exchange_int(int *p, int val);
float atomic_exchange_float(float *p, float val);
uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ { return atomic_exchange_unsigned(dest, val); }
static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ { return atomic_exchange_int(dest, val); }
static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ { return atomic_exchange_float(dest, val); }
static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ { return atomic_exchange_uint64(dest, val); }
#else
extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__;
// (int and float overloads elided in this listing)
extern uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__;
#endif
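// --- Editor's usage sketch (not part of the original header; names are
// illustrative). Every work-item swaps its own id into a shared slot;
// exactly one value wins, and each work-item observes some previous value:
inline void example_atomic_exchange(hc::array_view<unsigned int, 1> slot) {
    hc::parallel_for_each(hc::extent<1>(256), [=](hc::index<1> idx) [[hc]] {
        unsigned int prev = hc::atomic_exchange(&slot[0], static_cast<unsigned int>(idx[0]));
        (void)prev;
    });
}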
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) __HC__;
extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) __HC__;
extern "C" uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val) __HC__;

static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
    *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
    return (*dest == val);
}
static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
    *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
    return (*dest == val);
}
static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
    *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
    return (*dest == val);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
int atomic_compare_exchange_int(int *dest, int expected_val, int val);
uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val);

static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
    *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
    return (*dest == val);
}
static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
    *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
    return (*dest == val);
}
static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
    *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
    return (*dest == val);
}
#else
extern bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__;
// (int and uint64_t overloads elided in this listing)
#endif
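// --- Editor's usage sketch (not part of the original header; names are
// illustrative). The standard compare-exchange retry loop. Note the
// convention above: on failure, *expected_val is refreshed with the value
// observed in *dest, so the loop can recompute and retry immediately:
inline void cas_fetch_add(unsigned int* dest, unsigned int delta) __CPU__ __HC__ {
    unsigned int expected = *dest;
    while (!hc::atomic_compare_exchange(dest, &expected, expected + delta)) {
        // 'expected' now holds the freshly observed value; recompute and retry
    }
}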
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_add_int(int *p, int val) __HC__;
extern "C" float atomic_add_float(float *p, float val) __HC__;
extern "C" uint64_t atomic_add_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_add_unsigned(x, y); }
static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ { return atomic_add_int(x, y); }
static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ { return atomic_add_float(x, y); }
static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_add_uint64(x, y); }

extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_sub_int(int *p, int val) __HC__;
extern "C" float atomic_sub_float(float *p, float val) __HC__;

static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_sub_unsigned(x, y); }
static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ { return atomic_sub_int(x, y); }
static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ { return atomic_sub_float(x, y); }

extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_and_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_and_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_and_unsigned(x, y); }
static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ { return atomic_and_int(x, y); }
static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_and_uint64(x, y); }

extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_or_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_or_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_or_unsigned(x, y); }
static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ { return atomic_or_int(x, y); }
static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_or_uint64(x, y); }

extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_xor_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_xor_unsigned(x, y); }
static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ { return atomic_xor_int(x, y); }
static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_xor_uint64(x, y); }
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
int atomic_add_int(int *p, int val);
float atomic_add_float(float *p, float val);
uint64_t atomic_add_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_add_unsigned(x, y); }
static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ { return atomic_add_int(x, y); }
static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ { return atomic_add_float(x, y); }
static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_add_uint64(x, y); }

unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
int atomic_sub_int(int *p, int val);
float atomic_sub_float(float *p, float val);

static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_sub_unsigned(x, y); }
static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ { return atomic_sub_int(x, y); }
static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ { return atomic_sub_float(x, y); }

unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
int atomic_and_int(int *p, int val);
uint64_t atomic_and_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_and_unsigned(x, y); }
static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ { return atomic_and_int(x, y); }
static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_and_uint64(x, y); }

unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
int atomic_or_int(int *p, int val);
uint64_t atomic_or_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_or_unsigned(x, y); }
static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ { return atomic_or_int(x, y); }
static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_or_uint64(x, y); }

unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
int atomic_xor_int(int *p, int val);
uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ { return atomic_xor_unsigned(x, y); }
static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ { return atomic_xor_int(x, y); }
static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ { return atomic_xor_uint64(x, y); }
#else
// (extern declarations for atomic_fetch_add/sub/and elided in this listing)
extern unsigned atomic_fetch_or(unsigned *x, unsigned y) __CPU__ __HC__;
extern uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__;
// (extern declarations for atomic_fetch_xor elided in this listing)
#endif
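// --- Editor's usage sketch (not part of the original header; names are
// illustrative). A histogram built with atomic_fetch_add; 'bins' is shared
// by all work-items:
inline void example_histogram(hc::array_view<unsigned int, 1> bins,
                              hc::array_view<const unsigned int, 1> data) {
    hc::parallel_for_each(data.get_extent(), [=](hc::index<1> idx) [[hc]] {
        const unsigned int nbins = static_cast<unsigned int>(bins.get_extent()[0]);
        const int b = static_cast<int>(data[idx] % nbins);
        hc::atomic_fetch_add(&bins[b], 1u);
    });
}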
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_max_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_max_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ { return atomic_max_unsigned(x, y); }
static inline int atomic_fetch_max(int *x, int y) __HC__ { return atomic_max_int(x, y); }
static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ { return atomic_max_uint64(x, y); }

extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_min_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_min_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ { return atomic_min_unsigned(x, y); }
static inline int atomic_fetch_min(int *x, int y) __HC__ { return atomic_min_int(x, y); }
static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ { return atomic_min_uint64(x, y); }
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
int atomic_max_int(int *p, int val);
uint64_t atomic_max_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ { return atomic_max_unsigned(x, y); }
static inline int atomic_fetch_max(int *x, int y) __HC__ { return atomic_max_int(x, y); }
static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ { return atomic_max_uint64(x, y); }

unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
int atomic_min_int(int *p, int val);
uint64_t atomic_min_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ { return atomic_min_unsigned(x, y); }
static inline int atomic_fetch_min(int *x, int y) __HC__ { return atomic_min_int(x, y); }
static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ { return atomic_min_uint64(x, y); }
#else
extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) __CPU__ __HC__;
extern uint64_t atomic_fetch_max(uint64_t * dest, uint64_t val) __CPU__ __HC__;
// (int overloads elided in this listing)
extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) __CPU__ __HC__;
extern uint64_t atomic_fetch_min(uint64_t * dest, uint64_t val) __CPU__ __HC__;
#endif
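// --- Editor's usage sketch (not part of the original header; names are
// illustrative). Reducing to a global maximum; in the accelerator path above
// atomic_fetch_max is __HC__-only, so it is called from inside the kernel:
inline void example_reduce_max(hc::array_view<unsigned int, 1> result,
                               hc::array_view<const unsigned int, 1> data) {
    hc::parallel_for_each(data.get_extent(), [=](hc::index<1> idx) [[hc]] {
        hc::atomic_fetch_max(&result[0], data[idx]);
    });
}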
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) __HC__;
extern "C" int atomic_inc_int(int *p) __HC__;

static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ { return atomic_inc_unsigned(x); }
static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ { return atomic_inc_int(x); }

extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) __HC__;
extern "C" int atomic_dec_int(int *p) __HC__;

static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ { return atomic_dec_unsigned(x); }
static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ { return atomic_dec_int(x); }
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_inc_unsigned(unsigned int *p);
int atomic_inc_int(int *p);

static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ { return atomic_inc_unsigned(x); }
static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ { return atomic_inc_int(x); }

unsigned int atomic_dec_unsigned(unsigned int *p);
int atomic_dec_int(int *p);

static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ { return atomic_dec_unsigned(x); }
static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ { return atomic_dec_int(x); }
#endif
// (host-path extern declarations and intervening documentation elided in this
//  listing)

// atomically increment *address, wrapping around once val is reached
extern "C" unsigned int __atomic_wrapinc(unsigned int* address, unsigned int val) __HC__;
// atomically decrement *address, wrapping around at zero
extern "C" unsigned int __atomic_wrapdec(unsigned int* address, unsigned int val) __HC__;
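// --- Editor's usage sketch (not part of the original header; names are
// illustrative). Stream compaction using atomic_fetch_inc, which returns the
// pre-increment value, to claim unique output slots:
inline void example_compact(hc::array_view<unsigned int, 1> counter,
                            hc::array_view<unsigned int, 1> out,
                            hc::array_view<const unsigned int, 1> in) {
    hc::parallel_for_each(in.get_extent(), [=](hc::index<1> idx) [[hc]] {
        if (in[idx] != 0u) {
            unsigned int slot = hc::atomic_fetch_inc(&counter[0]);
            out[static_cast<int>(slot)] = in[idx];
        }
    });
}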
template <int N, typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<N>& compute_domain,
                                    const Kernel& f);

template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<1>& compute_domain,
                                    const Kernel& f);

template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<2>& compute_domain,
                                    const Kernel& f);

template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<3>& compute_domain,
                                    const Kernel& f);

// the accelerator_view-less overloads dispatch to the auto-selected view
template <int N, typename Kernel>
completion_future parallel_for_each(const extent<N>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}

template <typename Kernel>
completion_future parallel_for_each(const tiled_extent<1>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}

template <typename Kernel>
completion_future parallel_for_each(const tiled_extent<2>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}

template <typename Kernel>
completion_future parallel_for_each(const tiled_extent<3>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}
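// --- Editor's usage sketch (not part of the original header; names are
// illustrative). The overloads above let a kernel run on the auto-selected
// accelerator_view; a SAXPY over a one-dimensional extent:
inline void example_saxpy(float a,
                          hc::array_view<const float, 1> x,
                          hc::array_view<float, 1> y) {
    hc::parallel_for_each(x.get_extent(), [=](hc::index<1> i) [[hc]] {
        y[i] = a * x[i] + y[i];
    });
}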
// helpers that implement the loop nest used when a kernel domain has more
// dimensions than the hardware dispatch supports
template <int N, typename Kernel, typename _Tp>
struct pfe_helper
{
    static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
        int i;
        for (i = 0; i < k.ext[N - 1]; ++i) {
            idx[N - 1] = i;
            pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
        }
    }
};

template <typename Kernel, typename _Tp>
struct pfe_helper<0, Kernel, _Tp>
{
    static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        k.k(idx);
#endif
    }
};

template <int N, typename Kernel>
class pfe_wrapper
{
public:
    explicit pfe_wrapper(const extent<N>& other, const Kernel& f) __CPU__ __HC__
        : ext(other), k(f) {}
    void operator() (index<N> idx) __CPU__ __HC__ {
        // invoke the loop-nest helper over the extra dimensions
        // (call reconstructed; exact form elided in this listing)
        pfe_helper<N - 3, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
    }
private:
    const extent<N> ext;
    const Kernel k;
    template <int K, typename Ker, typename _Tp>
    friend struct pfe_helper;
};
// (exception messages in this reconstructed section are abridged)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <int N, typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<N>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    for (int i = 0; i < N; i++) {
        if (compute_domain[i] == 0)
            throw invalid_compute_domain("Extent is 0");
        if (compute_domain[i] < 0)
            throw invalid_compute_domain("Extent is negative");
        if (static_cast<size_t>(compute_domain[i]) > 4294967295L)
            throw invalid_compute_domain("Extent size too large");
    }
    size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
                     static_cast<size_t>(compute_domain[N - 2]),
                     static_cast<size_t>(compute_domain[N - 3])};
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    // wrap f in pfe_wrapper<N, Kernel> and launch over the 3 collapsed
    // dimensions (dispatch reconstructed from the friend declarations above)
    const pfe_wrapper<N, Kernel> _pf(compute_domain, f);
    return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<pfe_wrapper<N, Kernel>, 3>(av.pQueue, ext, NULL, _pf));
#else
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
#endif
    // ... device-compilation path: forces instantiation of the wrapper's
    //     trampoline (elided in this listing)
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<1>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    if (compute_domain[0] == 0)
        throw invalid_compute_domain("Extent is 0");
    if (compute_domain[0] < 0) {
        throw invalid_compute_domain("Extent is negative");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    size_t ext = compute_domain[0];
    return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 1>(av.pQueue, &ext, NULL, f));
#else // __KALMAR_ACCELERATOR__ == 1: only reference the kernel entry points
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<2>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    if (compute_domain[0] == 0 || compute_domain[1] == 0)
        throw invalid_compute_domain("Extent is 0");
    if (compute_domain[0] < 0 || compute_domain[1] < 0) {
        throw invalid_compute_domain("Extent is negative");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 2>(av.pQueue, ext, NULL, f));
#else
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<3>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
        throw invalid_compute_domain("Extent is 0");
    if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
        throw invalid_compute_domain("Extent is negative");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
                     static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 3>(av.pQueue, ext, NULL, f));
#else
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<1>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    if (compute_domain[0] == 0)
        throw invalid_compute_domain("Extent is 0");
    if (compute_domain[0] < 0) {
        throw invalid_compute_domain("Extent is negative");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    size_t ext = compute_domain[0];
    size_t tile = compute_domain.tile_dim[0];
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    // ... tile-size validation elided in this listing
    void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
    // (dispatch reconstructed from the friend declarations above)
    return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 1>(
        av.pQueue, &ext, &tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
#else
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<2>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    if (compute_domain[0] == 0 || compute_domain[1] == 0)
        throw invalid_compute_domain("Extent is 0");
    if (compute_domain[0] < 0 || compute_domain[1] < 0) {
        throw invalid_compute_domain("Extent is negative");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    size_t tile[2] = {static_cast<size_t>(compute_domain.tile_dim[1]),
                      static_cast<size_t>(compute_domain.tile_dim[0])};
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    // ... tile-size validation elided in this listing
    void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
    // (dispatch reconstructed from the friend declarations above)
    return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 2>(
        av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
#else
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<3>& compute_domain,
                                    const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
        throw invalid_compute_domain("Extent is 0");
    if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
        throw invalid_compute_domain("Extent is negative");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large");
    size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
                     static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    size_t tile[3] = {static_cast<size_t>(compute_domain.tile_dim[2]),
                      static_cast<size_t>(compute_domain.tile_dim[1]),
                      static_cast<size_t>(compute_domain.tile_dim[0])};
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // CPU fallback (guard elided in this listing):
    return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    // ... tile-size validation elided in this listing
    void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
    // (dispatch reconstructed from the friend declarations above)
    return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 3>(
        av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
#else
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
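// --- Editor's usage sketch (not part of the original header; names are
// illustrative, and it assumes the input extent is a multiple of 256). A
// tiled dispatch: the tile size feeds the 'tile' array computed above, and
// tile_static memory plus the tile barrier coordinate one tile's work-items:
inline void example_tiled_partial_sums(hc::array_view<const float, 1> in,
                                       hc::array_view<float, 1> partial) {
    hc::tiled_extent<1> te = in.get_extent().tile(256);
    hc::parallel_for_each(te, [=](hc::tiled_index<1> tidx) [[hc]] {
        tile_static float scratch[256];
        scratch[tidx.local[0]] = in[tidx.global];
        tidx.barrier.wait();
        if (tidx.local[0] == 0) {
            float sum = 0.0f;
            for (int i = 0; i < 256; ++i) sum += scratch[i];
            partial[tidx.tile] = sum; // one partial sum per tile
        }
    });
}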
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
The partial specialization array_view<const T,N> represents a view over elements of type const T with...
Definition: hc.hpp:5762
array_view(const extent< N > &ext)
Constructs an array_view which is not bound to a data source.
Definition: hc.hpp:5211
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator_view supports debugging through extensive ...
Definition: hc.hpp:197
This class is the return type of all asynchronous APIs and has an interface analogous to std::shared_...
Definition: hc.hpp:1130
array(int e0, int e1, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4387
uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the desitionation.
void * get_hsa_agent()
Returns an opaque handle which points to the underlying HSA agent.
Definition: hc.hpp:478
void flush()
Sends the queued up commands in the accelerator_view to the device for execution. ...
Definition: hc.hpp:241
array_view(const extent< N > &ext, value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5194
extent(const extent &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:1604
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:6205
int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the desitionation.
const T value_type
The element type of this array.
Definition: hc.hpp:5781
int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
tiled_extent(const extent< 1 > &ext, int t0) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2087
array_view< const ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:6249
unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do linear interpolation and computes the unsigned 8-bit average of packed data.
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6095
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4649
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2200
const index< 2 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3689
float __amdgcn_wave_sl1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2943
array & operator=(array &&other)
Moves the contents of the array "other" to this array.
Definition: hc.hpp:4705
array(int e0, int e1)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4276
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4643
bool get_is_auto_selection()
Returns a boolean value indicating whether the accelerator view when passed to a parallel_for_each wo...
Definition: hc.hpp:171
void * get_hsa_am_finegrained_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:515
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5225
tiled_extent(const extent< 2 > &ext, int t0, int t1, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2192
const index< 3 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3468
bool operator==(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>.
Definition: hc.hpp:1738
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3456
std::vector< accelerator > get_peers() const
Return a std::vector of this accelerator's peers.
Definition: hc.hpp:1070
array(const array_view< const T, N > &src, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array initialized with the contents of the array_view "src".
Definition: hc.hpp:4500
Represents an extent subdivided into tiles.
Definition: hc.hpp:2218
uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void * get_hsa_am_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:503
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5299
array(const extent< N > &ext, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4374
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2309
uint64_t get_begin_tick()
Get the tick number when the underlying asynchronous operation begins.
Definition: hc.hpp:1316
size_t get_dedicated_memory() const
Returns the amount of dedicated memory (in KB) on an accelerator device.
Definition: hc.hpp:907
tiled_extent(const tiled_extent &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2018
float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with lower ID relative to caller within a wavefront.
Definition: hc.hpp:3062
array_view< const T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:6187
float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:5425
unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2141
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__
Construct an tiled extent with the size of extent and the size of tile specified. ...
Definition: hc.hpp:2264
int __unpacklo_s16x2(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the desitionation.
bool operator!=(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent differe...
Definition: hc.hpp:847
array & operator=(const array &other)
Assigns the contents of the array "other" to this array, using a deep copy.
Definition: hc.hpp:4690
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchrnous operation.
Definition: hc.hpp:103
accelerator_view get_associated_accelerator_view() const
This property returns the accelerator_view representing the preferred target where this array can be ...
Definition: hc.hpp:4675
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: hc.hpp:5908
unsigned int __lastbit_u32_u32(unsigned int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2535
float __amdgcn_wave_rl1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2982
void wait() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3330
The tile_barrier class is a capability class that is only creatable by the system, and passed to a tiled parallel_for_each function object as part of the tiled_index parameter.
Definition: hc.hpp:3294
index< N > operator/(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:559
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5649
accelerator()
Constructs a new accelerator object that represents the default accelerator.
Definition: hc.hpp:713
array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4572
The array_view<T,N> type represents a possibly cached view into the data held in an array<T...
Definition: hc.hpp:60
unsigned int __firstbit_u32_u32(unsigned int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2477
extent & operator*=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1763
int tile_dim[3]
Tile size for each dimension.
Definition: hc.hpp:2231
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3655
array(const extent< N > &ext, InputIter srcBegin)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: hc.hpp:4298
void * get_hsa_agent() const
Returns an opaque handle which points to the underlying HSA agent.
Definition: hc.hpp:1051
uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4523
bool operator!=(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>.
Definition: hc.hpp:1741
unsigned int __bitrev_b32(unsigned int src0)[[hc]] __asm("llvm.bitreverse.i32")
Reverse the bits.
void global_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for global (but not tile-static) memory operations...
int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the desitionation.
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4997
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4326
void * get_hsa_am_finegrained_system_region() const
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:1001
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view.
Definition: hc.hpp:5367
const index< 1 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3581
Represents an extent subdivided into tiles.
Definition: hc.hpp:2031
bool get_is_empty()
Returns true if the accelerator_view is currently empty.
Definition: hc.hpp:458
tiled_extent(const tiled_extent< 2 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2173
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6056
array_view & operator=(const array_view< T, N > &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5919
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array to the array_view given by "dest", as if by calling "copy(*this...
Definition: hc.hpp:4753
extent & operator*=(int value) __CPU__ __HC__
For a given operator , produces the same effect as (*this) = (*this) value.
Definition: hc.hpp:1824
const index< 2 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3679
extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:1594
array_view(const extent< N > &extent, Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5180
Represents a unique position in N-dimensional space.
Definition: hc.hpp:58
uint64_t atomic_fetch_min(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void * get_hsa_kernarg_region()
Returns an opaque handle which points to the Kernarg region on the HSA agent.
Definition: hc.hpp:526
extent & operator+=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1755
accelerator & operator=(const accelerator &other)
Assigns an accelerator object to "this" accelerator object and returns a reference to "this" object...
Definition: hc.hpp:802
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2294
array_view< T, N > section(const index< N > &origin, const extent< N > &ext) __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4930
tiled_extent(const tiled_extent< 1 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2078
int __amdgcn_mbcnt_lo(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.lo")
Direct copy from indexed active work-item within a wavefront.
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6028
unsigned int get_version() const
Returns a 32-bit unsigned integer representing the version number of this accelerator.
Definition: hc.hpp:890
int __mul24(int x, int y)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication.
Definition: hc.hpp:3184
bool get_is_peer(const accelerator &other) const
Check if other is peer of this accelerator.
Definition: hc.hpp:1061
T & operator()(int i0, int i1) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:4852
int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the desitionation.
float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:3014
array_view< const T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:6272
void wait_with_tile_static_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3382
Definition: kalmar_exception.h:51
const index< 2 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3661
const index< 3 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3474
array_projection_helper< T, N >::result_type operator()(int i0) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4898
unsigned int atomic_fetch_inc(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location point to by dest.
static bool set_default(const std::wstring &path)
Sets the default accelerator to the device path identified by the "path" argument.
Definition: hc.hpp:770
uint64_t __clock_u64() __HC__
Get system timestamp.
unsigned int __popcount_u32_b32(unsigned int input) __HC__
Count number of 1 bits in the input.
Definition: hc.hpp:2389
void wait_with_all_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3347
uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
array_view< const T, N > section(const index< N > &origin, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4938
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:6196
void * get_hsa_kernarg_region() const
Returns an opaque handle which points to the Kernarg region on the HSA agent.
Definition: hc.hpp:1012
void * get_group_segment_base_pointer() __HC__
Fetch the address of the beginning of group segment.
unsigned int get_cu_count() const
Return the compute unit count of the accelerator.
Definition: hc.hpp:1087
array_view< T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:5624
unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
void * get_hsa_am_region() const
Returns an opaque handle which points to the AM region on the HSA agent.
Definition: hc.hpp:977
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4520
extent & operator+=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1796
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5317
T & operator()(const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4812
namespace for internal classes of Kalmar compiler / runtime
Definition: hc.hpp:42
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3586
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4652
void wait_with_global_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3364
float __amdgcn_ds_swizzle(float src, int pattern)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2896
const index< 3 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3490
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view.
Definition: hc.hpp:5988
array(int e0, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4385
unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the desitionation.
std::wstring get_device_path() const
Returns a system-wide unique device instance path that matches the "Device Instance Path" property fo...
Definition: hc.hpp:877
unsigned int size() const __CPU__ __HC__
This member function returns the total linear size of this extent<N> (in units of elements)...
Definition: hc.hpp:1695
void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, const void *args, size_t argsize, hc::completion_future *cf=nullptr, const char *kernel_name=nullptr)
Dispatch a kernel into the accelerator_view.
Definition: hc.hpp:597
array_view(const extent< N > &ext, const value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5830
unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the desitionation.
array(int e0, int e1, int e2)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4278
unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2539
void * get_dynamic_group_segment_base_pointer() __HC__
Fetch the address of the beginning of dynamic group segment.
void discard_data() const
Indicates to the runtime that it may discard the current logical contents of this array_view...
Definition: hc.hpp:5495
queuing_mode get_queuing_mode() const
Returns the queuing mode that this accelerator_view was created with.
Definition: hc.hpp:152
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:6040
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2237
const index< 1 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3563
uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__
Do bit field selection.
double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array_view< T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:5615
accelerator_view & operator=(const accelerator_view &other)
Assigns an accelerator_view object to "this" accelerator_view object and returns a reference to "this...
Definition: hc.hpp:141
array_view(const array_view &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5891
int value_type
The element type of extent<N>.
Definition: hc.hpp:1588
int tile_dim[2]
Tile size for each dimension.
Definition: hc.hpp:2135
array(int e0)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4274
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5997
uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest , replace it with the value given in val and return the old ...
bool has_cpu_accessible_am()
Return true if the accelerator's memory can be mapped into the CPU's address space, and the CPU is allowed to access the memory directly with CPU memory operations.
Definition: hc.hpp:1106
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:4957
accelerator get_accelerator() const
Returns the accelerator that this accelerator_view has been created on.
Definition: hc.hpp:1461
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array.
Definition: hc.hpp:4663
extent & operator%=(int value) __CPU__ __HC__
For the % operator, produces the same effect as (*this) = (*this) % value.
Definition: hc.hpp:1832
tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__
Constructs a tiled_extent with the extent size and tile size specified. ...
Definition: hc.hpp:2152
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2009
Definition: kalmar_exception.h:22
uint64_t atomic_fetch_max(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
int __amdgcn_mbcnt_hi(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.hi")
Direct copy from indexed active work-item within a wavefront.
accelerator_view(const accelerator_view &other)
Copy-constructs an accelerator_view object.
Definition: hc.hpp:129
extent & operator-=(const index< N > &idx) __CPU__ __HC__
Subtracts an object of type index<N> from this extent, element-wise.
Definition: hc.hpp:1800
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6230
array_view< const T, K > view_as(const extent< K > &viewExtent) const __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: hc.hpp:5087
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: hc.hpp:5289
void wait(hcWaitMode mode=hcWaitModeBlocked) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1235
array(const array &other)
Copy constructor.
Definition: hc.hpp:4242
index< N > operator-(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:498
unsigned int get_static_group_segment_size() __HC__
Fetch the size of the static group segment.
array_view< ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:5668
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2207
uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
hcAgentProfile get_profile() const
Returns the profile of the accelerator.
Definition: hc.hpp:1029
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2050
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5957
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3441
completion_future(const completion_future &other)
Copy constructor.
Definition: hc.hpp:1147
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6220
accelerator_view get_default_view() const
Returns the default accelerator_view associated with the accelerator.
Definition: hc.hpp:813
extent(const int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1636
array_view(array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5160
accelerator(const accelerator &other)
Copy constructs an accelerator object.
Definition: hc.hpp:740
array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4570
unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4532
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3485
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4829
unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do bit field selection.
array(const extent< N > &ext, accelerator_view av, void *accelerator_pointer, access_type cpu_access_type=access_type_auto)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4405
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5948
bool contains(const index< N > &idx) const __CPU__ __HC__
Tests whether the index "idx" is properly contained within this extent (with an assumed origin of zer...
Definition: hc.hpp:1686
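A short sketch of the extent<N> queries documented above (the concrete values are illustrative):

    hc::extent<2> e(4, 8);
    unsigned int n = e.size();                    // 32: total number of elements
    bool inside = e.contains(hc::index<2>(3, 7)); // true: within [0,4) x [0,8)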
uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__
Return a bit mask that shows which active work-items in the wavefront have a non-zero input.
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3684
unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3542
void copy(const void *src, void *dst, size_t size_bytes)
Copies size_bytes bytes from src to dst.
Definition: hc.hpp:343
void copy(const array_view< const T, 1 > &src, const array_view< T, 1 > &dest)
The contents of "src" are copied into "dest".
Definition: hc.hpp:6694
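A hedged sketch of the free hc::copy overload above, copying between two rank-1 array_views (the container setup is illustrative):

    std::vector<float> a(256, 1.0f), b(256, 0.0f);
    hc::array_view<const float, 1> src(256, a);
    hc::array_view<float, 1> dst(256, b);
    hc::copy(src, dst);   // the contents of "src" are copied into "dst"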
extent operator-(const index< N > &idx) __CPU__ __HC__
Subtracts an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1791
extent operator--(int) __CPU__ __HC__
For the -- operator, produces the same effect as (*this) = (*this) - 1.
Definition: hc.hpp:1860
const index< 1 > tile_dim
An index of rank 1, 2, or 3 that represents the size of the tile.
Definition: hc.hpp:3591
array_view< T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5639
bool set_default_cpu_access_type(access_type type)
Sets the default_cpu_access_type for this accelerator.
Definition: hc.hpp:867
const T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view.
Definition: hc.hpp:5974
completion_future()
Default constructor.
Definition: hc.hpp:1138
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4838
std::wstring get_description() const
Returns a short textual description of the accelerator device.
Definition: hc.hpp:882
Definition: kalmar_exception.h:42
T value_type
The element type of this array.
Definition: hc.hpp:5145
T & operator[](const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4803
unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect.
unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5005
array_view(int e0, const value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5866
void * get_native_handle() const
Get the native handle for the asynchronous operation encapsulated in this completion_future object...
Definition: hc.hpp:1302
unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__
Takes four floating-point numbers, converts them to unsigned integer values, and packs them into a pack...
Definition: kalmar_math.h:691
array_view(const array_view< nc_T, N > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5881
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5848
array(const extent< N > &ext)
Constructs a new array with the supplied extent, located on the default view of the default accelerat...
Definition: hc.hpp:4264
int __mad24(int x, int y, int z)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication and then ad...
Definition: hc.hpp:3210
float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__
Unpacks a single element from a packed u8x4 value and converts it to an f32.
array(int e0, int e1, int e2, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4389
float __amdgcn_ds_permute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2880
unsigned int __popcount_u32_b64(unsigned long long int input) __HC__
Count number of 1 bits in the input.
Definition: hc.hpp:2399
array(int e0, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4317
array(const array_view< const T, N > &src, accelerator_view av, accelerator_view associated_av)
Constructs a staging array initialized with the array_view given by "src", which acts as a staging ar...
Definition: hc.hpp:4621
Represents an extent subdivided into tiles.
Definition: hc.hpp:2122
extent & operator+=(int value) __CPU__ __HC__
For the + operator, produces the same effect as (*this) = (*this) + value.
Definition: hc.hpp:1816
void then(const functor &func)
This method enables specification of a completion callback func which is executed upon completion of ...
Definition: hc.hpp:1279
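To illustrate the completion_future methods above, a minimal sketch (the kernel body and callback are placeholders) that chains a callback and then blocks:

    hc::completion_future cf = hc::parallel_for_each(
        hc::extent<1>(512), [=](hc::index<1> i) [[hc]] { /* ... */ });
    cf.then([] { /* runs once the kernel has completed */ });
    cf.wait();            // block until the asynchronous operation finishes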
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2283
array(int e0, int e1, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4425
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4526
int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
Represents a logical (isolated) accelerator view of a compute accelerator.
Definition: hc.hpp:120
int tile_dim[1]
Tile size for each dimension.
Definition: hc.hpp:2044
Represents an N-dimensional region of memory (with type T) located on an accelerator.
Definition: hc.hpp:61
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4869
tiled_extent(const extent< 1 > &ext, int t0, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2096
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchronous operation.
Definition: hc.hpp:1344
access_type get_default_cpu_access_type() const
Get the default cpu access_type for buffers created on this accelerator.
Definition: hc.hpp:946
unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4646
uint64_t get_system_ticks()
Get the current tick count for the GPU platform.
Definition: hc.hpp:93
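A sketch of converting the platform tick counter above into seconds, assuming the free function hc::get_system_ticks() listed here together with a matching free hc::get_tick_frequency() in this header:

    uint64_t t0 = hc::get_system_ticks();
    // ... submit and wait for some work ...
    uint64_t t1 = hc::get_system_ticks();
    double seconds = double(t1 - t0) / double(hc::get_tick_frequency());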
bool operator==(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:419
array_view< T, K > view_as(const extent< K > &viewExtent) __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: hc.hpp:5078
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view.
Definition: hc.hpp:5897
Represents an extent subdivided into tiles.
Definition: hc.hpp:59
array_view< T, N > section(const index< N > &idx) __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:4949
uint64_t get_end_tick()
Get the tick number when the underlying asynchronous operation ends.
Definition: hc.hpp:1330
array(int e0, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4320
void * get_hsa_am_region()
Returns an opaque handle which points to the AM region on the HSA agent.
Definition: hc.hpp:490
execute_order get_execute_order() const
Returns the execution order of this accelerator_view.
Definition: hc.hpp:157
extent(int e0) __CPU__ __HC__
Constructs an extent<N> with the coordinate value provided by e0.
Definition: hc.hpp:1616
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4655
completion_future(completion_future &&other)
Move constructor.
Definition: hc.hpp:1159
bool is_hsa_accelerator() const
Returns whether the accelerator is based on HSA.
Definition: hc.hpp:1019
unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
bool get_supports_limited_double_precision() const
Returns a boolean value indicating whether the accelerator has limited double precision support (excl...
Definition: hc.hpp:922
const index< 1 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3575
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: hc.hpp:4301
uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
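As a sketch of the atomic fetch operations above, counting matches in a kernel with the unsigned-int overload of atomic_fetch_add (an overload assumed alongside the uint64_t one listed here; the extent and predicate are illustrative):

    hc::array_view<unsigned int, 1> counter(1);
    counter[0] = 0;
    hc::parallel_for_each(hc::extent<1>(4096), [=](hc::index<1> i) [[hc]] {
      if ((i[0] & 1) == 0)
        hc::atomic_fetch_add(&counter[0], 1u);  // atomic across work-items
    });
    counter.synchronize();                      // counter[0] == 2048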
extent(_Tp...__t) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by __t.
Definition: hc.hpp:1620
array_view(int e0)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5258
int __unpacklo_s8x4(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__
Constructs a tiled_extent with the extent size and tile size specified. ...
Definition: hc.hpp:2250
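A minimal sketch of a tiled dispatch built on the tiled_extent constructors above, together with a tile barrier (the tile size of 64 is illustrative):

    hc::extent<1> ext(1024);
    hc::tiled_extent<1> text = ext.tile(64);      // 64 work-items per tile
    hc::parallel_for_each(text, [=](hc::tiled_index<1> tidx) [[hc]] {
      tile_static int scratch[64];                // tile-static scratch storage
      scratch[tidx.local[0]] = tidx.global[0];
      tidx.barrier.wait();                        // synchronize the whole tile
      // ... consume scratch ...
    });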
accelerator_view create_view(execute_order order=execute_in_order, queuing_mode mode=queuing_mode_automatic)
Creates and returns a new accelerator view on the accelerator with the supplied queuing mode...
Definition: hc.hpp:823
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4993
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5376
int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
array_projection_helper< T, N >::const_result_type operator()(int i0) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4906
const index< 1 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3569
int get_pending_async_ops()
Returns the number of pending asynchronous operations on this accelerator view.
Definition: hc.hpp:447
int get_use_count() const
Definition: hc.hpp:1381
bool get_has_display() const
This property indicates that the accelerator may be shared by (and thus have interference from) the o...
Definition: hc.hpp:900
unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
Definition: hc.hpp:2409
tiled_extent(int e0, int t0) __CPU__ __HC__
Constructs a tiled_extent with the extent size and tile size specified. ...
Definition: hc.hpp:2059
index< N > operator+(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:492
extent(int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1647
std::future_status wait_for(const std::chrono::duration< _Rep, _Period > &_Rel_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1248
unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array_view< const T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6225
void wait(hcWaitMode waitMode=hcWaitModeBlocked)
Performs a blocking wait for completion of all commands submitted to the accelerator view prior to ca...
Definition: hc.hpp:208
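To show the queue-level wait above in context, a sketch that drains the default accelerator_view (the kernel is a placeholder):

    hc::accelerator acc;
    hc::accelerator_view av = acc.get_default_view();
    hc::parallel_for_each(av, hc::extent<1>(256),
                          [=](hc::index<1>) [[hc]] { /* ... */ });
    av.wait();            // blocks until all previously submitted commands finish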
unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
accelerator_view get_accelerator_view() const
This property returns the accelerator_view representing the location where this array has been alloca...
Definition: hc.hpp:4669
unsigned int __atomic_wrapinc(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations:
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5464
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4596
Represents a unique position in N-dimensional space.
Definition: kalmar_index.h:226
array(const extent< N > &ext, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator vi...
Definition: hc.hpp:4549
uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:4972
int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator.
Definition: hc.hpp:953
bool operator!=(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:431
array_view(int e0, value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5244
extent & operator/=(const extent &__r) __CPU__ __HC__
Divides this extent element-wise by the extent "__r".
Definition: hc.hpp:1767
const index< 2 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3673
void copy_to(array &dest) const
Copies the contents of this array to the array given by "dest", as if by calling "copy(*this, dest)".
Definition: hc.hpp:4735
extent & operator/=(int value) __CPU__ __HC__
For the / operator, produces the same effect as (*this) = (*this) / value.
Definition: hc.hpp:1828
tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__
Constructs a tiled_extent with the extent size and tile size specified. ...
Definition: hc.hpp:2164
array & operator=(const array_view< T, N > &src)
Assigns the contents of the array_view "src", as if by calling "copy(src, *this)".
Definition: hc.hpp:4722
int __any(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2781
tiled_extent(const extent< 2 > &ext, int t0, int t1) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2182
int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary byte boundary.
extent & operator%=(const extent &__r) __CPU__ __HC__
Computes, element-wise, the remainder of dividing this extent by the extent "__r".
Definition: hc.hpp:1771
unsigned int __lastbit_u32_s32(int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2543
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6132
unsigned int __firstbit_u32_s32(int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2502
bool is_ready()
Returns whether the asynchronous operation has completed.
Definition: hc.hpp:1357
access_type get_cpu_access_type() const
This property returns the CPU "access_type" allowed for this array.
Definition: hc.hpp:4680
T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a reference to the element of this array_view that is at the location in N-dimensional space ...
Definition: hc.hpp:5517
tiled_extent(const tiled_extent< 3 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2273
uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4332
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3640
uint64_t __cycle_u64() __HC__
Get hardware cycle count.
unsigned int get_group_segment_size() __HC__
Fetch the size of the group segment.
float __amdgcn_wave_rr1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2963
uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect.
array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4427
uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
extent & operator--() __CPU__ __HC__
For the -- operator, produces the same effect as (*this) = (*this) - 1.
Definition: hc.hpp:1856
completion_future & operator=(completion_future &&_Other)
Move assignment.
Definition: hc.hpp:1188
array(int e0, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4568
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6088
unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
This function is mostly the same as sad except the sum of absolute differences is added to the most s...
tiled_extent(int e0, int t0, int size) __CPU__ __HC__
Constructs a tiled_extent with the extent size and tile size specified. ...
Definition: hc.hpp:2069
float __amdgcn_ds_bpermute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2865
bool set_cu_mask(const std::vector< bool > &cu_mask)
Set a CU affinity to specific command queues.
Definition: hc.hpp:618
T * data() const __CPU__ __HC__
Returns a pointer to the raw data underlying this array.
Definition: hc.hpp:4760
T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view.
Definition: hc.hpp:5352
int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
uint64_t __ballot(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return an integer whose Nth bit is ...
Definition: hc.hpp:2800
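A hedged sketch combining __ballot with __popcount_u32_b64 (documented elsewhere in this index) to count, per wavefront, the lanes that satisfy a predicate (the predicate itself is illustrative):

    hc::parallel_for_each(hc::extent<1>(1024), [=](hc::index<1> i) [[hc]] {
      uint64_t mask = hc::__ballot(i[0] % 3 == 0);    // one bit per active lane
      unsigned int n = hc::__popcount_u32_b64(mask);  // lanes where it held
      (void)n;  // e.g. let lane 0 publish n for the wavefront
    });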
int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4640
extent & operator++() __CPU__ __HC__
For the ++ operator, produces the same effect as (*this) = (*this) + 1.
Definition: hc.hpp:1847
index< N > operator*(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:547
array_view< T, 2 > section(int i0, int i1, int e0, int e1) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5001
index< N > operator%(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:571
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view.
Definition: hc.hpp:5278
array(int e0, int e1, int e2, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4329
int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
unsigned int __activelanecount_u32_b1(unsigned int input) __HC__
Count the number of active work-items in the current wavefront that have a non-zero input...
Definition: hc.hpp:2768
array(const array_view< const T, N > &src)
Constructs a new array, located on the default view of the default accelerator, initialized with the ...
Definition: hc.hpp:4348
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4593
extent & operator-=(const extent &__r) __CPU__ __HC__
Subtracts the extent "__r" from this extent, element-wise.
Definition: hc.hpp:1759
uint64_t __bitrev_b64(uint64_t src0)[[hc]] __asm("llvm.bitreverse.i64")
Reverse the bits.
bool get_supports_cpu_shared_memory() const
Returns a boolean value indicating whether the accelerator supports memory accessible both by the acc...
Definition: hc.hpp:941
uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
float __amdgcn_wave_sr1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2923
int __all(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2790
accelerator(const std::wstring &path)
Constructs a new accelerator object that represents the physical device named by the "path" argument...
Definition: hc.hpp:730
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of the dynamic group segment.
Definition: hc.hpp:2302
bool get_is_emulated() const
Returns a boolean value indicating whether the accelerator is emulated.
Definition: hc.hpp:935
bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__
These functions attempt to perform these three steps atomically:
completion_future & operator=(const completion_future &_Other)
Copy assignment.
Definition: hc.hpp:1170
int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
extent & operator=(const extent &other) __CPU__ __HC__
Assigns the component values of "other" to this extent<N> object.
Definition: hc.hpp:1657
int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
const T & operator()(int i0) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6123
array_view< T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:5602
array(int e0, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4423
int __unpackhi_s16x2(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4529
Heterogeneous C++ (HC) namespace.
Definition: grid_launch.h:10
int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item based on bitwise XOR of caller work-item ID within a wavefront...
Definition: hc.hpp:3142
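Building on __shfl_xor above, a sketch of a wavefront-wide butterfly reduction, assuming a full 64-lane wavefront (__HSA_WAVEFRONT_SIZE__):

    int wave_sum(int v) [[hc]] {
      for (int offset = __HSA_WAVEFRONT_SIZE__ / 2; offset > 0; offset /= 2)
        v += hc::__shfl_xor(v, offset);  // exchange with lane (id XOR offset)
      return v;                          // every lane now holds the full sum
    }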
int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
void * get_hsa_am_system_region() const
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:989
extent operator++(int) __CPU__ __HC__
For the ++ operator, produces the same effect as (*this) = (*this) + 1.
Definition: hc.hpp:1851
static std::vector< accelerator > get_all()
Returns a std::vector of accelerator objects (in no specific order) representing all accelerators tha...
Definition: hc.hpp:749
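A sketch that enumerates accelerators with get_all and prints the wide-string properties documented in this index:

    #include <hc.hpp>
    #include <iostream>

    int main() {
      for (const hc::accelerator& acc : hc::accelerator::get_all())
        std::wcout << acc.get_device_path() << L": "
                   << acc.get_description() << std::endl;
    }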
bool get_supports_double_precision() const
Returns a Boolean value indicating whether this accelerator supports double-precision (double) comput...
Definition: hc.hpp:914
unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
std::vector< accelerator_view > get_all_views()
Returns a vector of all accelerator_view associated with this accelerator.
Definition: hc.hpp:960
array_view< T, 1 > section(int i0, int e0) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4989
void copy_to(const array_view &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5335
array_view< T, N > section(const extent< N > &ext) __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:4968
T & operator()(int i0, int i1, int i2) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:4855
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4460
array(int e0, int e1, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4323
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator supports debugging.
Definition: hc.hpp:929
int __unpackhi_s8x4(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6128
uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5928
void tile_static_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for tile-static (but not global) memory operations...
extent & operator-=(int value) __CPU__ __HC__
For the - operator, produces the same effect as (*this) = (*this) - value.
Definition: hc.hpp:1820
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator view.
Definition: hc.hpp:437
unsigned int __lastbit_u32_s64(unsigned long long input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2547
uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
#define __HSA_WAVEFRONT_SIZE__
Fetch the size of a wavefront.
Definition: hc.hpp:2373
uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
tile_barrier(const tile_barrier &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3317
int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
void * get_hsa_queue()
Returns an opaque handle which points to the underlying HSA queue.
Definition: hc.hpp:468
unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of the dynamic group segment.
Definition: hc.hpp:2104
array(const extent< N > &ext, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4392
bool valid() const
This method is functionally identical to std::shared_future<void>::valid.
Definition: hc.hpp:1213
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array.
Definition: hc.hpp:4775
array_view(const array_view &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5272
array_view< const ElementType, 1 > reinterpret_as() const __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with an (unsafe) reinterpretation of the element type.
Definition: hc.hpp:5051
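As a sketch of reinterpret_as above, viewing the storage of a 2-D array as a flat view with a reinterpreted element type (float to unsigned int keeps the element size identical):

    hc::array<float, 2> arr(hc::extent<2>(4, 4));
    hc::array_view<unsigned int, 1> bits =
        arr.reinterpret_as<unsigned int>();  // same bytes, rank-1, new element type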
unsigned int atomic_fetch_dec(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest.
array(array &&other)
Move constructor.
Definition: hc.hpp:4253
uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
T value_type
The element type of this array.
Definition: hc.hpp:4227
array_view< T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:5693
unsigned int __activelaneid_u32() __HC__
Get the count of earlier (in flattened work-item order) active work-items within the sa...
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5009
unsigned int __firstbit_u32_s64(long long int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2520
array_view< ElementType, 1 > reinterpret_as() __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with an (unsafe) reinterpretation of the element type.
Definition: hc.hpp:5038
const index< 2 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3667
int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl)[[hc]]
Move DPP intrinsic.
unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary bit boundary.
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4872
int __unpack_s32_s32x2(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5412
const index< 3 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3462
const index< 3 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3480
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2111
array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4517
void all_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for both global and tile-static memory operations...
bool is_hsa_accelerator()
Returns whether the accelerator view is based on HSA.
Definition: hc.hpp:533
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5644
unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
bool operator==(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent the sam...
Definition: hc.hpp:837
std::future_status wait_until(const std::chrono::time_point< _Clock, _Duration > &_Abs_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1253
Definition: kalmar_math.h:297
unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2489
Represents a physical accelerated computing device.
Definition: hc.hpp:700
Definition: kalmar_runtime.h:14
int get_seqnum() const
Return the unique integer sequence-number for the accelerator.
Definition: hc.hpp:1095
unsigned int __atomic_wrapdec(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations:
int __lane_id(void)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2845
array_view(const array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5796
float atomic_fetch_sub(float *x, float y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3557
array_view(const extent< N > &extent, const Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5816
float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with higher ID relative to caller within a wavefront.
Definition: hc.hpp:3111
extent operator+(const index< N > &idx) __CPU__ __HC__
Adds an object of type index<N> to this extent to form a new extent.
Definition: hc.hpp:1786
static accelerator_view get_auto_selection_view()
Returns an accelerator_view which when passed as the first argument to a parallel_for_each call cause...
Definition: hc.hpp:789