#include "hc_defines.h"
#include "kalmar_exception.h"
#include "kalmar_index.h"
#include "kalmar_runtime.h"
#include "kalmar_serialize.h"
#include "kalmar_launch.h"
#include "kalmar_buffer.h"
#include "kalmar_math.h"

#include "hsa_atomic.h"
#include "kalmar_cpu_launch.h"
#include "hcc_features.hpp"

#ifndef __HC__
#   define __HC__ [[hc]]
#endif

#ifndef __CPU__
#   define __CPU__ [[cpu]]
#endif

typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
// forward declarations
class accelerator_view;
class completion_future;
template <typename T, int N> class array;
/** Get the current tick count of the underlying platform. */
inline uint64_t get_system_ticks() {
    return Kalmar::getContext()->getSystemTicks();
}

/** Get the frequency of the platform tick counter, in ticks per second. */
inline uint64_t get_tick_frequency() {
    return Kalmar::getContext()->getSystemTickFrequency();
}
#define GET_SYMBOL_ADDRESS(acc, symbol) \
    acc.get_symbol_address( #symbol );

    accelerator_view(const accelerator_view& other) :
        pQueue(other.pQueue) {}

    accelerator_view& operator=(const accelerator_view& other) {
        pQueue = other.pQueue;
        return *this;
    }
    unsigned int get_version() const;
    void wait(hcWaitMode waitMode = hcWaitModeBlocked) {
      pQueue->wait(waitMode);
      Kalmar::getContext()->flushPrintfBuffer();
    }
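    /**
     * Example (illustrative sketch; assumes an HCC toolchain with <hc.hpp>):
     * wait() drains every command queued on this accelerator_view.
     * @code
     *   hc::accelerator_view av = hc::accelerator().get_default_view();
     *   // ... enqueue kernels / copies on av ...
     *   av.wait();                    // default: hcWaitModeBlocked
     *   av.wait(hcWaitModeActive);    // spin-wait: lower latency, more CPU
     * @endcode
     */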
    completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list,
                                             memory_scope fence_scope = system_scope) const;

    template<typename InputIterator>
    completion_future create_blocking_marker(InputIterator first, InputIterator last,
                                             memory_scope scope) const;
    void copy(const void *src, void *dst, size_t size_bytes) {
        pQueue->copy(src, dst, size_bytes);
    }
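    /**
     * Example (illustrative sketch): a synchronous copy through the view. The
     * device pointer is assumed to come from a pinned/device allocator such as
     * hc::am_alloc (assumption; not shown here).
     * @code
     *   std::vector<int> host(1024, 7);
     *   int* dev = nullptr;  // assume: device-visible allocation of 1024 ints
     *   av.copy(host.data(), dev, host.size() * sizeof(int));
     * @endcode
     */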
    void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir,
                  const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
                  bool forceUnpinnedCopy);
    bool operator==(const accelerator_view& other) const {
        return pQueue == other.pQueue;
    }

    size_t get_max_tile_static_size() {
        return pQueue.get()->getDev()->GetMaxTileStaticSize();
    }

    int get_pending_async_ops() {
        return pQueue->getPendingAsyncOps();
    }

    bool get_is_empty() {
        return pQueue->isEmpty();
    }

    void* get_hsa_queue() {
        return pQueue->getHSAQueue();
    }

    void* get_hsa_agent() {
        return pQueue->getHSAAgent();
    }

    void* get_hsa_am_region() {
        return pQueue->getHSAAMRegion();
    }

    void* get_hsa_am_system_region() {
        return pQueue->getHSAAMHostRegion();
    }

    void* get_hsa_am_finegrained_system_region() {
        return pQueue->getHSACoherentAMHostRegion();
    }

    void* get_hsa_kernarg_region() {
        return pQueue->getHSAKernargRegion();
    }

    bool is_hsa_accelerator() {
        return pQueue->hasHSAInterOp();
    }
    void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
                             const void * args, size_t argsize,
                             hc::completion_future *cf, const char *kernel_name) {
        pQueue->dispatch_hsa_kernel(aql, args, argsize, cf, kernel_name);
    }
    bool set_cu_mask(const std::vector<bool>& cu_mask) {
        // the mask can only be applied to views backed by an HSA queue
        if(is_hsa_accelerator()) {
            return pQueue->set_cu_mask(cu_mask);
        }
        return false;
    }
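    /**
     * Example (illustrative sketch): restrict a queue to the first four
     * compute units; returns false on non-HSA views.
     * @code
     *   std::vector<bool> mask(acc.get_cu_count(), false);
     *   for (int i = 0; i < 4 && i < (int)mask.size(); ++i) mask[i] = true;
     *   bool ok = av.set_cu_mask(mask);
     * @endcode
     */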
private:
    accelerator_view(std::shared_ptr<Kalmar::KalmarQueue> pQueue) : pQueue(pQueue) {}
    std::shared_ptr<Kalmar::KalmarQueue> pQueue;
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;

    template<typename Kernel> friend
        void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, const Kernel&);
    template<typename Kernel, int dim_ext> friend
        void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
    template<typename Kernel, int dim_ext> friend
        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
    template<typename Kernel, int dim_ext> friend
        void Kalmar::mcw_cxxamp_launch_kernel(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
    template<typename Kernel, int dim_ext> friend
        std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_launch_kernel_async(
            const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename Kernel, int N> friend
        completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>&,
                                                Kernel const&, extent<N> const&);
#endif

    // friend declarations for the parallel_for_each overloads
    // (target signatures reconstructed; the declarations are elided in this excerpt)
    template <int N, typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);

public:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // ... (serialization helpers elided)
#endif
    __attribute__((annotate("user_deserialize")))
    accelerator_view() __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        throw runtime_exception("errorMsg_throw", 0);
#endif
    }
};

// ... (remainder of accelerator_view elided; class accelerator begins)

    explicit accelerator(const std::wstring& path)
        : pDev(Kalmar::getContext()->getDevice(path)) {}
    static std::vector<accelerator> get_all() {
        auto Devices = Kalmar::getContext()->getDevices();
        std::vector<accelerator> ret;
        for(auto&& i : Devices)
            ret.push_back(i);
        return ret;
    }

    static bool set_default(const std::wstring& path) {
        return Kalmar::getContext()->set_default(path);
    }

    static accelerator_view get_auto_selection_view() {
        return Kalmar::getContext()->auto_select();
    }

    accelerator_view create_view(execute_order order = execute_in_order,
                                 queuing_mode mode = queuing_mode_automatic) {
        auto pQueue = pDev->createQueue(order);
        pQueue->set_mode(mode);
        return pQueue;
    }
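    /**
     * Example (illustrative sketch): enumerate accelerators, pick the default
     * one, and create an extra in-order queue on it. get_description() is
     * assumed to be available on hc::accelerator.
     * @code
     *   for (const hc::accelerator& a : hc::accelerator::get_all())
     *       std::wcout << a.get_description() << std::endl;
     *   hc::accelerator acc;                          // default accelerator
     *   hc::accelerator_view av = acc.create_view();  // execute_in_order by default
     * @endcode
     */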
    bool set_default_cpu_access_type(access_type type) {
        pDev->set_access(type);
        return true;
    }

    size_t get_max_tile_static_size() {
      return get_default_view().get_max_tile_static_size();
    }
    std::vector<accelerator_view> get_all_views() {
        std::vector<accelerator_view> result;
        std::vector< std::shared_ptr<Kalmar::KalmarQueue> > queues = pDev->get_all_queues();
        for (auto q : queues) {
            result.push_back(q);
        }
        return result;
    }
    void* get_hsa_am_region() {
        return get_default_view().get_hsa_am_region();
    }

    void* get_hsa_am_system_region() {
        return get_default_view().get_hsa_am_system_region();
    }

    void* get_hsa_am_finegrained_system_region() {
        return get_default_view().get_hsa_am_finegrained_system_region();
    }

    void* get_hsa_kernarg_region() {
        return get_default_view().get_hsa_kernarg_region();
    }

    bool is_hsa_accelerator() {
        return get_default_view().is_hsa_accelerator();
    }

    hcAgentProfile get_profile() const {
        return pDev->getProfile();
    }
    void memcpy_symbol(const char* symbolName, void* hostptr, size_t count,
                       size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
        pDev->memcpySymbol(symbolName, hostptr, count, offset, kind);
    }

    void memcpy_symbol(void* symbolAddr, void* hostptr, size_t count,
                       size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
        pDev->memcpySymbol(symbolAddr, hostptr, count, offset, kind);
    }

    void* get_symbol_address(const char* symbolName) {
        return pDev->getSymbolAddress(symbolName);
    }
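    /**
     * Example (illustrative sketch): copy a host buffer into a device-side
     * global symbol, then look up its device address via the macro defined
     * earlier. `my_global` is a hypothetical [[hc]] global defined elsewhere.
     * @code
     *   float host_data[16] = {0};
     *   acc.memcpy_symbol("my_global", host_data, sizeof(host_data));
     *   void* dev_addr = GET_SYMBOL_ADDRESS(acc, my_global);
     * @endcode
     */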
    void* get_hsa_agent() const {
        return pDev->getHSAAgent();
    }

    bool get_is_peer(const accelerator& other) const {
        return pDev->is_peer(other.pDev);
    }

    std::vector<accelerator> get_peers() const {
        std::vector<accelerator> peers;

        const auto &accs = get_all();

        for(auto iter = accs.begin(); iter != accs.end(); iter++)
        {
            if(this->get_is_peer(*iter))
                peers.push_back(*iter);
        }
        return peers;
    }

    unsigned int get_cu_count() const {
        return pDev->get_compute_unit_count();
    }

    int get_seqnum() const {
        return pDev->get_seqnum();
    }

    bool has_cpu_accessible_am() const {
        return pDev->has_cpu_accessible_am();
    }

    Kalmar::KalmarDevice *get_dev_ptr() const { return pDev; }

private:
    accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {}
    Kalmar::KalmarDevice* pDev;
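    /**
     * Example (illustrative sketch): query which accelerators can directly
     * access this accelerator's memory.
     * @code
     *   hc::accelerator acc;
     *   for (const hc::accelerator& peer : acc.get_peers()) {
     *       // peer-to-peer access is possible between acc and peer
     *   }
     * @endcode
     */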
    completion_future(const completion_future& other)
        : __amp_future(other.__amp_future), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}

    completion_future(completion_future&& other)
        : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}

    completion_future& operator=(const completion_future& _Other) {
        if (this != &_Other) {
           __amp_future  = _Other.__amp_future;
           __thread_then = _Other.__thread_then;
           __asyncOp     = _Other.__asyncOp;
        }
        return *this;
    }

    completion_future& operator=(completion_future&& _Other) {
        if (this != &_Other) {
            __amp_future  = std::move(_Other.__amp_future);
            __thread_then = _Other.__thread_then;
            __asyncOp     = _Other.__asyncOp;
        }
        return *this;
    }
    bool valid() const {
        return __amp_future.valid();
    }

    void wait(hcWaitMode mode = hcWaitModeBlocked) const {
        if (this->valid()) {
            if (__asyncOp != nullptr) {
                __asyncOp->setWaitMode(mode);
            }
            __amp_future.wait();
        }
        Kalmar::getContext()->flushPrintfBuffer();
    }
    template <class _Rep, class _Period>
    std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const {
        return __amp_future.wait_for(_Rel_time);
    }

    template <class _Clock, class _Duration>
    std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const {
        return __amp_future.wait_until(_Abs_time);
    }

    operator std::shared_future<void>() const {
        return __amp_future;
    }
    // register a callback to run once the future becomes ready
    template<typename functor>
    void then(const functor & func) {
#if __KALMAR_ACCELERATOR__ != 1
      // the callback can only be assigned once
      if (__thread_then == nullptr) {
        // spawn a new thread to wait on the future and then execute the callback
        __thread_then = new std::thread([&]() __CPU__ {
          this->wait();
          if(this->valid())
            func();
        });
      }
#endif
    }

    void* get_native_handle() const {
      if (__asyncOp != nullptr) {
        return __asyncOp->getNativeHandle();
      }
      return nullptr;
    }

    uint64_t get_begin_tick() {
      if (__asyncOp != nullptr) {
        return __asyncOp->getBeginTimestamp();
      }
      return 0L;
    }

    uint64_t get_end_tick() {
      if (__asyncOp != nullptr) {
        return __asyncOp->getEndTimestamp();
      }
      return 0L;
    }

    uint64_t get_tick_frequency() {
      if (__asyncOp != nullptr) {
        return __asyncOp->getTimestampFrequency();
      }
      return 0L;
    }

    bool is_ready() {
      if (__asyncOp != nullptr) {
        return __asyncOp->isReady();
      }
      return false;
    }

    ~completion_future() {
      if (__thread_then != nullptr) {
        __thread_then->join();
      }
      delete __thread_then;
      __thread_then = nullptr;

      if (__asyncOp != nullptr) {
        __asyncOp = nullptr;
      }
    }
private:
    std::shared_future<void> __amp_future;
    std::thread* __thread_then = nullptr;
    std::shared_ptr<Kalmar::KalmarAsyncOp> __asyncOp;

    completion_future(std::shared_ptr<Kalmar::KalmarAsyncOp> event)
        : __amp_future(*(event->getFuture())), __asyncOp(event) {}

    completion_future(const std::shared_future<void> &__future)
        : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {}

    friend class Kalmar::HSAQueue;
    // friend declarations: parallel_for_each overloads
    // (target signatures reconstructed; the declarations are elided in this excerpt)
    template <int N, typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
    template <typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);

    // friend declarations: copy_async overloads
    template <typename T, int N> friend
        completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);

    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);

    template <typename OutputIter, typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
    template <typename OutputIter, typename T, int N> friend
        completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);

    template <typename T, int N> friend class array_view;
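/**
 * Example (illustrative sketch; assumes an HCC toolchain with <hc.hpp>):
 * launch an asynchronous copy, attach a continuation, and read the profiling
 * ticks once it completes. src, dst and bytes are assumed valid.
 * @code
 *   hc::completion_future cf = av.copy_async(src, dst, bytes);
 *   cf.then([]{ });   // continuation runs on a helper thread after completion
 *   cf.wait();
 *   uint64_t ticks = cf.get_end_tick() - cf.get_begin_tick();
 *   double seconds  = double(ticks) / double(cf.get_tick_frequency());
 * @endcode
 */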
inline accelerator accelerator_view::get_accelerator() const {
    return pQueue->getDev();
}

inline completion_future accelerator_view::create_marker(memory_scope scope) const {
    std::shared_ptr<Kalmar::KalmarAsyncOp> deps[1];
    // if necessary, create an explicit dependency on the previous command
    std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);

    int cnt = 0;
    if (depOp) {
        deps[cnt++] = depOp;
    }

    return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
}

inline unsigned int accelerator_view::get_version() const {
    return get_accelerator().get_version();
}
inline completion_future accelerator_view::create_blocking_marker(completion_future& dependent_future,
                                                                  memory_scope scope) const {
    std::shared_ptr<Kalmar::KalmarAsyncOp> deps[2];
    // if necessary, create an explicit dependency on the previous command
    std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);

    int cnt = 0;
    if (depOp) {
        deps[cnt++] = depOp;
    }

    if (dependent_future.__asyncOp) {
        deps[cnt++] = dependent_future.__asyncOp;
    }

    return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
}
template<typename InputIterator>
inline completion_future
accelerator_view::create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const {
    // 5 dependencies is the maximum supported by a single barrier packet
    std::shared_ptr<Kalmar::KalmarAsyncOp> deps[5];
    completion_future lastMarker;

    // if necessary, create an explicit dependency on the previous command
    std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);

    int cnt = 0;
    if (depOp) {
        deps[cnt++] = depOp;
    }

    // group the dependencies into batches of 5; every batch becomes one
    // barrier packet, and the barrier bit keeps the batches in order
    for (auto iter = first; iter != last; ++iter) {
        if (iter->__asyncOp) {
            deps[cnt++] = iter->__asyncOp;
            if (cnt == 5) {
                lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, hc::no_scope));
                cnt = 0;
            }
        }
    }

    if (cnt) {
        lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
    }

    return lastMarker;
}

inline completion_future
accelerator_view::create_blocking_marker(std::initializer_list<completion_future> dependent_future_list,
                                         memory_scope scope) const {
    return create_blocking_marker(dependent_future_list.begin(), dependent_future_list.end(), scope);
}
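/**
 * Example (illustrative sketch): make one queue wait for work submitted to two
 * other queues before proceeding.
 * @code
 *   hc::completion_future f1 = av1.copy_async(a, b, bytes);
 *   hc::completion_future f2 = av2.copy_async(c, d, bytes);
 *   hc::completion_future barrier =
 *       av0.create_blocking_marker({f1, f2}, hc::system_scope);
 *   barrier.wait();   // av0 has now observed completion of f1 and f2
 * @endcode
 */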
inline void accelerator_view::copy_ext(const void *src, void *dst, size_t size_bytes,
                                       hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo,
                                       const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc,
                                       bool forceUnpinnedCopy) {
    pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo,
                     copyAcc ? copyAcc->pDev : nullptr, forceUnpinnedCopy);
}

inline void accelerator_view::copy_ext(const void *src, void *dst, size_t size_bytes,
                                       hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo,
                                       const hc::AmPointerInfo &dstInfo, bool forceHostCopyEngine) {
    pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo, forceHostCopyEngine);
}
inline completion_future
accelerator_view::copy_async(const void *src, void *dst, size_t size_bytes) {
    return completion_future(pQueue->EnqueueAsyncCopy(src, dst, size_bytes));
}

inline completion_future
accelerator_view::copy_async_ext(const void *src, void *dst, size_t size_bytes,
                                 hcCommandKind copyDir,
                                 const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
                                 const hc::accelerator *copyAcc) {
    return completion_future(pQueue->EnqueueAsyncCopyExt(src, dst, size_bytes, copyDir, srcInfo, dstInfo,
                                                         copyAcc ? copyAcc->pDev : nullptr));
}
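/**
 * Example (illustrative sketch): overlap a device-to-host read-back with
 * host-side work via the asynchronous copy interface. do_unrelated_host_work
 * is a hypothetical helper.
 * @code
 *   hc::completion_future cf = av.copy_async(dev_buf, host_buf, bytes);
 *   do_unrelated_host_work();
 *   cf.wait();   // host_buf is valid from here on
 * @endcode
 */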
    static const int rank = N;

    extent() __CPU__ __HC__ : base_() {
      static_assert(N > 0, "Dimensionality must be positive");
    }

    extent(const extent& other) __CPU__ __HC__
        : base_(other.base_) {}

    template <typename ..._Tp>
    explicit extent(_Tp ... __t) __CPU__ __HC__
        : base_(__t...) {
      static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor");
      static_assert(sizeof...(__t) == N, "rank should be consistent");
    }
    explicit extent(const int components[]) __CPU__ __HC__
        : base_(components) {}

    explicit extent(int components[]) __CPU__ __HC__
        : base_(components) {}

    extent& operator=(const extent& other) __CPU__ __HC__ {
        base_.operator=(other.base_);
        return *this;
    }

    int operator[] (unsigned int c) const __CPU__ __HC__ {
        return base_[c];
    }
    int& operator[] (unsigned int c) __CPU__ __HC__ {
        return base_[c];
    }

    bool contains(const index<N>& idx) const __CPU__ __HC__ {
        return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
    }

    unsigned int size() const __CPU__ __HC__ {
        return Kalmar::index_helper<N, extent<N>>::count_size(*this);
    }
    tiled_extent<2> tile_with_dynamic(int t0, int t1, int dynamic_size) const;
    tiled_extent<3> tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const;
    bool operator==(const extent& other) const __CPU__ __HC__ {
        return Kalmar::index_helper<N, extent<N> >::equal(*this, other);
    }
    bool operator!=(const extent& other) const __CPU__ __HC__ {
        return !(*this == other);
    }

    extent& operator+=(const extent& __r) __CPU__ __HC__ { base_.operator+=(__r.base_); return *this; }
    extent& operator-=(const extent& __r) __CPU__ __HC__ { base_.operator-=(__r.base_); return *this; }
    extent& operator*=(const extent& __r) __CPU__ __HC__ { base_.operator*=(__r.base_); return *this; }
    extent& operator/=(const extent& __r) __CPU__ __HC__ { base_.operator/=(__r.base_); return *this; }
    extent& operator%=(const extent& __r) __CPU__ __HC__ { base_.operator%=(__r.base_); return *this; }

    extent& operator+=(const index<N>& idx) __CPU__ __HC__ { base_.operator+=(idx.base_); return *this; }
    extent& operator-=(const index<N>& idx) __CPU__ __HC__ { base_.operator-=(idx.base_); return *this; }

    extent& operator+=(int value) __CPU__ __HC__ { base_.operator+=(value); return *this; }
    extent& operator-=(int value) __CPU__ __HC__ { base_.operator-=(value); return *this; }
    extent& operator*=(int value) __CPU__ __HC__ { base_.operator*=(value); return *this; }
    extent& operator/=(int value) __CPU__ __HC__ { base_.operator/=(value); return *this; }
    extent& operator%=(int value) __CPU__ __HC__ { base_.operator%=(value); return *this; }

    extent& operator++() __CPU__ __HC__ { base_.operator+=(1); return *this; }
    extent operator++(int) __CPU__ __HC__ {
        extent ret = *this;
        base_.operator+=(1);
        return ret;
    }
    extent& operator--() __CPU__ __HC__ { base_.operator-=(1); return *this; }
    extent operator--(int) __CPU__ __HC__ {
        extent ret = *this;
        base_.operator-=(1);
        return ret;
    }
private:
    typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
    base base_;
    template <int K, typename Q> friend struct Kalmar::index_helper;
    template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
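/**
 * Example (illustrative sketch): extents behave like small integer vectors;
 * the compound operators above map directly onto the members shown.
 * @code
 *   hc::extent<2> e(4, 8);
 *   e *= 2;                                       // (8, 16)
 *   unsigned int n = e.size();                    // 8 * 16 = 128
 *   bool in = e.contains(hc::index<2>(7, 15));    // true
 * @endcode
 */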
    static const int rank = N;
    int tile_dim[N];

    // copy the tile dimensions from another tiled_extent
    tiled_extent(const tiled_extent& other) __CPU__ __HC__ : extent<N>(other) {
      for (int i = 0; i < N; ++i) {
        tile_dim[i] = other.tile_dim[i];
      }
    }

    unsigned int dynamic_group_segment_size;

// --- tiled_extent<1> specialization ---
    static const int rank = 1;

    tiled_extent(int e0, int t0, int size) __CPU__ __HC__
        : extent(e0), dynamic_group_segment_size(size), tile_dim{t0} {}

    void set_dynamic_group_segment_size(unsigned int size) __CPU__ __HC__ {
        dynamic_group_segment_size = size;
    }

    unsigned int get_dynamic_group_segment_size() const __CPU__ __HC__ {
        return dynamic_group_segment_size;
    }
    unsigned int dynamic_group_segment_size;

// --- tiled_extent<2> specialization ---
    static const int rank = 2;

    tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__
        : extent(e0, e1), dynamic_group_segment_size(0), tile_dim{t0, t1} {}

    tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__
        : extent(e0, e1), dynamic_group_segment_size(size), tile_dim{t0, t1} {}

    tiled_extent(const tiled_extent<2>& other) __CPU__ __HC__
        : extent(other[0], other[1]), dynamic_group_segment_size(other.dynamic_group_segment_size),
          tile_dim{other.tile_dim[0], other.tile_dim[1]} {}

    void set_dynamic_group_segment_size(unsigned int size) __CPU__ __HC__ {
        dynamic_group_segment_size = size;
    }

    unsigned int get_dynamic_group_segment_size() const __CPU__ __HC__ {
        return dynamic_group_segment_size;
    }
    unsigned int dynamic_group_segment_size;

// --- tiled_extent<3> specialization ---
    static const int rank = 3;

    tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__
        : extent(e0, e1, e2), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}

    tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__
        : extent(e0, e1, e2), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}

    tiled_extent(const tiled_extent<3>& other) __CPU__ __HC__
        : extent(other[0], other[1], other[2]),
          dynamic_group_segment_size(other.dynamic_group_segment_size),
          tile_dim{other.tile_dim[0], other.tile_dim[1], other.tile_dim[2]} {}

    tiled_extent(const extent<3>& ext, int t0, int t1, int t2, int size) __CPU__ __HC__
        : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}

    void set_dynamic_group_segment_size(unsigned int size) __CPU__ __HC__ {
        dynamic_group_segment_size = size;
    }

    unsigned int get_dynamic_group_segment_size() const __CPU__ __HC__ {
        return dynamic_group_segment_size;
    }
template <int N>
inline tiled_extent<1> extent<N>::tile(int t0) const __CPU__ __HC__ {
  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
  return tiled_extent<1>(*this, t0);
}

template <int N>
inline tiled_extent<2> extent<N>::tile(int t0, int t1) const __CPU__ __HC__ {
  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
  return tiled_extent<2>(*this, t0, t1);
}

template <int N>
inline tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const __CPU__ __HC__ {
  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
  return tiled_extent<3>(*this, t0, t1, t2);
}

template <int N>
inline tiled_extent<1> extent<N>::tile_with_dynamic(int t0, int dynamic_size) const __CPU__ __HC__ {
  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
  return tiled_extent<1>(*this, t0, dynamic_size);
}

template <int N>
inline tiled_extent<2> extent<N>::tile_with_dynamic(int t0, int t1, int dynamic_size) const __CPU__ __HC__ {
  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
  return tiled_extent<2>(*this, t0, t1, dynamic_size);
}

template <int N>
inline tiled_extent<3> extent<N>::tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const __CPU__ __HC__ {
  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
  return tiled_extent<3>(*this, t0, t1, t2, dynamic_size);
}
#define __HSA_WAVEFRONT_SIZE__ (64)

extern "C" unsigned int __wavesize() __HC__;

#if __hcc_backend__==HCC_BACKEND_AMDGPU
extern "C" inline unsigned int __wavesize() __HC__ {
  return __HSA_WAVEFRONT_SIZE__;
}
#endif

/** Count the number of 1 bits in the input. */
extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ {
  return __builtin_popcount(input);
}

extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ {
  return __builtin_popcountl(input);
}
extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
  return (src0 << (32 - src1 - src2)) >> (32 - src2);
}

extern "C" uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__;

extern "C" uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__;

extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__;

extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__;

extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__;

extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__;

unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32");

uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64");
extern "C" unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__;

/** Position of the first set bit counted from the most-significant end; -1 when the input is 0. */
extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ {
  return input == 0 ? -1 : __builtin_clz(input);
}

extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ {
  return input == 0 ? -1 : __builtin_clzl(input);
}

/** Position of the first set bit counted from the least-significant end; -1 when the input is 0. */
extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ {
  return input == 0 ? -1 : __builtin_ctz(input);
}

extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long input) __HC__ {
  return input == 0 ? -1 : __builtin_ctzl(input);
}
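/**
 * Example (illustrative sketch): the bit intrinsics in use; on the CPU path
 * they reduce to the __builtin_* calls shown above.
 * @code
 *   unsigned int v = 0x00F0u;
 *   unsigned int field = __bitextract_u32(v, 4, 4);   // bits [7:4] -> 0xF
 *   unsigned int ones  = __popcount_u32_b32(v);       // 4
 *   unsigned int first = __firstbit_u32_u32(v);       // 24 zeros precede the top set bit
 * @endcode
 */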
extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__;

extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__;

extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__;

extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__;

extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__;

extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__;

extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__;

extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__;

extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__;

extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__;

extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__;

extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__;

extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__;

extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__;

extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__;

extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__;

extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__;

extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__;

extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;

extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
/** Non-zero when at least one active lane's predicate is non-zero. */
extern "C" inline int __any(int predicate) __HC__ {
  // ... (implementation elided in this excerpt)
}

/** Non-zero when every active lane's predicate is non-zero. */
extern "C" inline int __all(int predicate) __HC__ {
  // ... (implementation elided in this excerpt)
}

/** Collect each lane's predicate into a 64-bit mask, one bit per lane. */
extern "C" inline uint64_t __ballot(int predicate) __HC__ {
  // ... (implementation elided in this excerpt)
}
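/**
 * Example (illustrative sketch): wavefront-wide votes inside a kernel; every
 * lane evaluates a predicate and the intrinsics combine the per-lane results.
 * `data` and `threshold` are assumed kernel captures.
 * @code
 *   int pred = data[idx[0]] > threshold;
 *   int any  = __any(pred);            // non-zero if at least one lane matched
 *   int all  = __all(pred);            // non-zero if every lane matched
 *   uint64_t mask = __ballot(pred);    // one bit per lane
 * @endcode
 */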
#if __hcc_backend__==HCC_BACKEND_AMDGPU
// __u reinterprets a 32-bit lane value between int, unsigned int, and float.
// (definition reconstructed; the original declaration is elided in this excerpt)
union __u { int i; unsigned int u; float f; };

// unsigned int / float wrappers over the int ds_permute builtins
inline unsigned int __amdgcn_ds_bpermute(int index, unsigned int src) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
  return tmp.u;
}
inline float __amdgcn_ds_bpermute(int index, float src) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
  return tmp.f;
}

inline unsigned int __amdgcn_ds_permute(int index, unsigned int src) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_ds_permute(index, tmp.i);
  return tmp.u;
}
inline float __amdgcn_ds_permute(int index, float src) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_ds_permute(index, tmp.i);
  return tmp.f;
}

inline unsigned int __amdgcn_ds_swizzle(unsigned int src, int pattern) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.u;
}
inline float __amdgcn_ds_swizzle(float src, int pattern) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
  return tmp.f;
}
#endif

extern "C" int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) [[hc]];

// shift/rotate-by-one-lane helpers; unsigned int / float wrappers over the
// int builtins (names reconstructed per the hc.hpp wave-shift family)
inline unsigned int __amdgcn_wave_sl1(unsigned int src, bool bound_ctrl) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
  return tmp.u;
}
inline float __amdgcn_wave_sl1(float src, bool bound_ctrl) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
  return tmp.f;
}
inline unsigned int __amdgcn_wave_sr1(unsigned int src, bool bound_ctrl) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
  return tmp.u;
}
inline float __amdgcn_wave_sr1(float src, bool bound_ctrl) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
  return tmp.f;
}
inline unsigned int __amdgcn_wave_rl1(unsigned int src) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_wave_rl1(tmp.i);
  return tmp.u;
}
inline float __amdgcn_wave_rl1(float src) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_wave_rl1(tmp.i);
  return tmp.f;
}
inline unsigned int __amdgcn_wave_rr1(unsigned int src) [[hc]] {
  __u tmp; tmp.u = src;
  tmp.i = __amdgcn_wave_rr1(tmp.i);
  return tmp.u;
}
inline float __amdgcn_wave_rr1(float src) [[hc]] {
  __u tmp; tmp.f = src;
  tmp.i = __amdgcn_wave_rr1(tmp.i);
  return tmp.f;
}
#if __hcc_backend__==HCC_BACKEND_AMDGPU
inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
  int self = __lane_id();
  int index = srcLane + (self & ~(width-1));
  return __amdgcn_ds_bpermute(index<<2, var);
}
#endif

inline unsigned int __shfl(unsigned int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl(tmp.i, srcLane, width);
    return tmp.u;
}

inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl(tmp.i, srcLane, width);
    return tmp.f;
}

#if __hcc_backend__==HCC_BACKEND_AMDGPU
inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
  int self = __lane_id();
  int index = self - delta;
  index = (index < (self & ~(width-1)))?self:index;
  return __amdgcn_ds_bpermute(index<<2, var);
}
#endif

inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl_up(tmp.i, delta, width);
    return tmp.u;
}

inline float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl_up(tmp.i, delta, width);
    return tmp.f;
}

#if __hcc_backend__==HCC_BACKEND_AMDGPU
inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
  int self = __lane_id();
  int index = self + delta;
  index = (int)((self&(width-1))+delta) >= width?self:index;
  return __amdgcn_ds_bpermute(index<<2, var);
}
#endif

inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl_down(tmp.i, delta, width);
    return tmp.u;
}

inline float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl_down(tmp.i, delta, width);
    return tmp.f;
}

#if __hcc_backend__==HCC_BACKEND_AMDGPU
inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
  int self = __lane_id();
  int index = self^laneMask;
  index = index >= ((self+width)&~(width-1))?self:index;
  return __amdgcn_ds_bpermute(index<<2, var);
}
#endif

inline float __shfl_xor(float var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.f = var;
    tmp.i = __shfl_xor(tmp.i, laneMask, width);
    return tmp.f;
}

inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
    __u tmp; tmp.u = var;
    tmp.i = __shfl_xor(tmp.i, laneMask, width);
    return tmp.u;
}
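/**
 * Example (illustrative sketch): the classic wavefront reduction built on
 * __shfl_down; after the loop, lane 0 of each wavefront holds the sum of all
 * 64 lanes. `value` is the per-lane input, inside a kernel.
 * @code
 *   float sum = value;
 *   for (int offset = __HSA_WAVEFRONT_SIZE__ / 2; offset > 0; offset /= 2)
 *       sum += __shfl_down(sum, offset);
 * @endcode
 */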
inline unsigned int __mul24(unsigned int x, unsigned int y) [[hc]] {
  return (x & 0x00FFFFFF) * (y & 0x00FFFFFF);
}

inline int __mul24(int x, int y) [[hc]] {
  return ((x << 8) >> 8) * ((y << 8) >> 8);
}

inline unsigned int __mad24(unsigned int x, unsigned int y, unsigned int z) [[hc]] {
  return __mul24(x, y) + z;
}

inline int __mad24(int x, int y, int z) [[hc]] {
  return __mul24(x, y) + z;
}
inline void abort() __HC__ {
  // ... (implementation elided in this excerpt)
}
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
// trampoline invoked on each ucontext: runs the kernel functor for one work-item
template <typename Ker, typename Ti>
void bar_wrapper(Ker *f, Ti *t)
{
    (*f)(*t);
}

struct barrier_t {
    std::unique_ptr<ucontext_t[]> ctx;
    int idx = 0;
    barrier_t (int a) :
        ctx(new ucontext_t[a + 1]) {}
    template <typename Ti, typename Ker>
    void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) {
        getcontext(&ctx[x]);
        ctx[x].uc_stack.ss_sp = stack;
        ctx[x].uc_stack.ss_size = S;
        ctx[x].uc_link = &ctx[x - 1];
        makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
    }
    void swap(int a, int b) {
        swapcontext(&ctx[a], &ctx[b]);
    }
    void wait() __HC__ {
        --idx;
        swapcontext(&ctx[idx + 1], &ctx[idx]);
    }
};
#endif
class tile_barrier {
public:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    using pb_t = std::shared_ptr<barrier_t>;
    tile_barrier(pb_t pb) : pbar(pb) {}
#endif

    void wait() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        wait_with_all_memory_fence();
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

    void wait_with_all_memory_fence() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

    void wait_with_global_memory_fence() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_GLOBAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

    void wait_with_tile_static_memory_fence() const __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_LOCAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        pbar->wait();
#endif
    }

private:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    pb_t pbar;
#endif
    template <int N> friend
        class tiled_index;
};

// --- tiled_index<3> ---
    static const int rank = 3;
    tiled_index(const tiled_index& other) __CPU__ __HC__
        : global(other.global), local(other.local), tile(other.tile),
          tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}

#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index(int a0, int a1, int a2, int b0, int b1, int b2,
                                               int c0, int c1, int c2, tile_barrier& pb,
                                               int D0, int D1, int D2) __CPU__ __HC__
        : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0),
          tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {}
#endif

    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() __HC__
        : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
          local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
          tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
          tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2),
                               amp_get_global_id(1) - amp_get_local_id(1),
                               amp_get_global_id(0) - amp_get_local_id(0))),
          tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0)))
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
#else
    __attribute__((always_inline)) tiled_index() __HC__
#endif // __KALMAR_ACCELERATOR__
    {}

    template<typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template<typename K> friend
        void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int);
#endif
// --- tiled_index<1> ---
    static const int rank = 1;

    tiled_index(const tiled_index& other) __CPU__ __HC__
        : global(other.global), local(other.local), tile(other.tile),
          tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}

#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__
        : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {}
#endif

    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() __HC__
        : global(index<1>(amp_get_global_id(0))),
          local(index<1>(amp_get_local_id(0))),
          tile(index<1>(amp_get_group_id(0))),
          tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))),
          tile_dim(index<1>(amp_get_local_size(0)))
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
#else
    __attribute__((always_inline)) tiled_index() __HC__
#endif // __KALMAR_ACCELERATOR__
    {}

    template<typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template<typename K> friend
        void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int);
#endif
// --- tiled_index<2> ---
    static const int rank = 2;

    tiled_index(const tiled_index& other) __CPU__ __HC__
        : global(other.global), local(other.local), tile(other.tile),
          tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}

#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index(int a0, int a1, int b0, int b1, int c0, int c1,
                                               tile_barrier& pb, int D0, int D1) __CPU__ __HC__
        : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0),
          barrier(pb), tile_dim(D0, D1) {}
#endif

    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() __HC__
        : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
          local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
          tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
          tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1),
                               amp_get_global_id(0) - amp_get_local_id(0))),
          tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0)))
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() __CPU__ __HC__
#else
    __attribute__((always_inline)) tiled_index() __HC__
#endif // __KALMAR_ACCELERATOR__
    {}

    template<typename Kernel> friend
        completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template<typename K> friend
        void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int);
#endif
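/**
 * Example (illustrative sketch): a tiled dispatch over 8x8 tiles; tiled_index
 * exposes the global/local/tile coordinates constructed above, and barrier
 * synchronizes the tile. `in` and `out` are assumed array_view<float, 2>
 * captures of matching extent.
 * @code
 *   hc::parallel_for_each(ext.tile(8, 8), [=](hc::tiled_index<2> tidx) [[hc]] {
 *       tile_static float tmp[8][8];
 *       tmp[tidx.local[0]][tidx.local[1]] = in[tidx.global];
 *       tidx.barrier.wait();
 *       out[tidx.global] = tmp[tidx.local[1]][tidx.local[0]];
 *   });
 * @endcode
 */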
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
#define SSIZE 1024 * 10
// recursively walk dimensions 1..K-1 of the extent, invoking the kernel at the leaf
template <int N, typename Kernel, int K>
struct cpu_helper
{
    static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
        int i;
        for (i = 0; i < ext[N]; ++i) {
            idx[N] = i;
            cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
        }
    }
};
template <typename Kernel, int K>
struct cpu_helper<K, Kernel, K>
{
    static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
        (const_cast<Kernel&>(k))(idx);
    }
};

// run one of NTHREAD horizontal slices of the compute domain on the host
template <typename Kernel, int N>
void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
    index<N> idx;
    int start = ext[0] * part / Kalmar::NTHREAD;
    int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
    for (int i = start; i < end; i++) {
        idx[0] = i;
        cpu_helper<1, Kernel, N>::call(ker, idx, ext);
    }
}
template <typename Kernel>
void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) {
    int D0 = ext.tile_dim[0];
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    // ... (early-out and per-tile index storage setup elided)
    char *stk = new char[D0 * SSIZE];
    // ...
    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0);
    // ...
    for (int tx = start; tx < end; tx++) {
        // ... (placement-new a tiled_index<1> for each work-item of the tile)
        for (int x = 0; x < D0; x++) {
            // ...
            hc_bar->setctx(++id, sp, f, tip, SSIZE);
        }
        // ...
        while (hc_bar->idx == 0) {
            // ...
            hc_bar->swap(0, id);
        }
    }
    // ... (cleanup elided)
}

template <typename Kernel>
void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) {
    int D0 = ext.tile_dim[0];
    int D1 = ext.tile_dim[1];
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    // ...
    char *stk = new char[D1 * D0 * SSIZE];
    // ...
    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1);
    // ...
    for (int tx = 0; tx < ext[1] / D1; tx++)
        for (int ty = start; ty < end; ty++) {
            // ...
            for (int x = 0; x < D1; x++)
                for (int y = 0; y < D0; y++) {
                    new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1);
                    hc_bar->setctx(++id, sp, f, tip, SSIZE);
                    // ...
                }
            // ...
            while (hc_bar->idx == 0) {
                // ...
                hc_bar->swap(0, id);
            }
        }
    // ... (cleanup elided)
}

template <typename Kernel>
void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) {
    int D0 = ext.tile_dim[0];
    int D1 = ext.tile_dim[1];
    int D2 = ext.tile_dim[2];
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    // ...
    char *stk = new char[D2 * D1 * D0 * SSIZE];
    // ...
    tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
    // ...
    for (int i = 0; i < ext[2] / D2; i++)
        for (int j = 0; j < ext[1] / D1; j++)
            for(int k = start; k < end; k++) {
                // ...
                for (int x = 0; x < D2; x++)
                    for (int y = 0; y < D1; y++)
                        for (int z = 0; z < D0; z++) {
                            // (global-id arguments reconstructed by analogy with the 2D case)
                            new (tip) tiled_index<3>(D2 * i + x, D1 * j + y, D0 * k + z,
                                                     x, y, z, i, j, k, tbar, D0, D1, D2);
                            hc_bar->setctx(++id, sp, f, tip, SSIZE);
                            // ...
                        }
                // ...
                while (hc_bar->idx == 0) {
                    // ...
                    hc_bar->swap(0, id);
                }
            }
    // ... (cleanup elided)
}
template <typename Kernel, int N>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                                        extent<N> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task<Kernel, N>, std::cref(f), std::cref(compute_domain), i);
    // ... (future construction elided)
}

template <typename Kernel>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                                        tiled_extent<1> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile_1D<Kernel>,
                             std::cref(f), std::cref(compute_domain), i);
    // ... (future construction elided)
}

template <typename Kernel>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                                        tiled_extent<2> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile_2D<Kernel>,
                             std::cref(f), std::cref(compute_domain), i);
    // ... (future construction elided)
}

template <typename Kernel>
completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                                        tiled_extent<3> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile_3D<Kernel>,
                             std::cref(f), std::cref(compute_domain), i);
    // ... (future construction elided)
}
template <typename T, int N>
struct projection_helper
{
    // for array_view<T, N> where N > 1:
    //   array_view<T, N - 1> operator[](int i) const __CPU__ __HC__
    static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
    typedef array_view<T, N - 1> result_type;
    static result_type project(array_view<T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        return result_type (now.cache, ext_now, ext_base, idx_base,
                            now.offset + ext_base.size() * stride);
    }
    static result_type project(const array_view<T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        return result_type (now.cache, ext_now, ext_base, idx_base,
                            now.offset + ext_base.size() * stride);
    }
};

template <typename T>
struct projection_helper<T, 1>
{
    // for array_view<T, 1>:
    //   T& operator[](int i) const __CPU__ __HC__
    typedef T& result_type;
    static result_type project(array_view<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
    static result_type project(const array_view<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
};

template <typename T, int N>
struct projection_helper<const T, N>
{
    // for array_view<const T, N> where N > 1:
    //   array_view<const T, N - 1> operator[](int i) const __CPU__ __HC__
    static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
    typedef array_view<const T, N - 1> const_result_type;
    static const_result_type project(array_view<const T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
                                      now.offset + ext_base.size() * stride);
        return ret;
    }
    static const_result_type project(const array_view<const T, N>& now, int stride) __CPU__ __HC__ {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
                                      now.offset + ext_base.size() * stride);
        return ret;
    }
};

template <typename T>
struct projection_helper<const T, 1>
{
    // for array_view<const T, 1>:
    //   const T& operator[](int i) const __CPU__ __HC__
    typedef const T& const_result_type;
    static const_result_type project(array_view<const T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access();
#endif
        const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
    static const_result_type project(const array_view<const T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access();
#endif
        const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
};
// SFINAE detectors used to accept arbitrary containers in array_view constructors
template <typename T>
struct __has_data
{
private:
    struct two {char __lx; char __lxx;};
    template <typename C> static char test(decltype(std::declval<C>().data()));
    template <typename C> static two test(...);
public:
    static const bool value = sizeof(test<T>(0)) == 1;
};

template <typename T>
struct __has_size
{
private:
    struct two {char __lx; char __lxx;};
    template <typename C> static char test(decltype(&C::size));
    template <typename C> static two test(...);
public:
    static const bool value = sizeof(test<T>(0)) == 1;
};

template <typename T>
struct __is_container
{
    using _T = typename std::remove_reference<T>::type;
    static const bool value = __has_size<_T>::value && __has_data<_T>::value;
};
template <typename T, int N>
struct array_projection_helper
{
    // for array<T, N> where N > 1:
    //   array_view<T, N - 1> operator[](int i) __CPU__ __HC__
    static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher");
    typedef array_view<T, N - 1> result_type;
    typedef array_view<const T, N - 1> const_result_type;
    static result_type project(array<T, N>& now, int stride) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        // ... (negative-stride check elided)
#endif
        int comp[N - 1], i;
        for (i = N - 1; i > 0; --i)
            comp[i - 1] = now.extent[i];
        extent<N - 1> ext(comp);
        int offset = ext.size() * stride;
#if __KALMAR_ACCELERATOR__ != 1
        if( offset >= now.extent.size())
            throw runtime_exception("errorMsg_throw", 0);
#endif
        return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
    }
    static const_result_type project(const array<T, N>& now, int stride) __CPU__ __HC__ {
        int comp[N - 1], i;
        for (i = N - 1; i > 0; --i)
            comp[i - 1] = now.extent[i];
        extent<N - 1> ext(comp);
        int offset = ext.size() * stride;
        return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
    }
};
template <typename T>
struct array_projection_helper<T, 1>
{
    // for array<T, 1>:
    //   T& operator[](int i) __CPU__ __HC__
    typedef T& result_type;
    typedef const T& const_result_type;
    static result_type project(array<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize(true);
#endif
        T *ptr = reinterpret_cast<T *>(now.m_device.get() + i);
        return *ptr;
    }
    static const_result_type project(const array<T, 1>& now, int i) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize();
#endif
        const T *ptr = reinterpret_cast<const T *>(now.m_device.get() + i);
        return *ptr;
    }
};

// validate that every dimension of an extent is positive
template <typename T, int N>
const extent<N>& check(const extent<N>& ext)
{
#if __KALMAR_ACCELERATOR__ != 1
    for (int i = 0; i < N; i++)
    {
        if(ext[i] <= 0)
            throw runtime_exception("errorMsg_throw", 0);
    }
#endif
    return ext;
}
// Forward declarations of the copy routines (argument lists reconstructed to
// match the definitions later in the header).
template <typename T, int N> void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array<T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array<T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array_view<const T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array_view<T, N>& src, array<T, N>& dest);

template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, const array_view<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, array<T, N>& dest);

template <typename OutputIter, typename T, int N> void copy(const array_view<T, N>& src, OutputIter destBegin);
template <typename OutputIter, typename T, int N> void copy(const array<T, N>& src, OutputIter destBegin);
template <typename T, int N = 1>
class array {
    static_assert(!std::is_const<T>::value, "array<const T> is not supported");
public:
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<T> acc_buffer_t;
#endif

    static const int rank = N;
    typedef T value_type;
    array() = delete;

    array(const array& other)
        : array(other.get_extent(), other.get_accelerator_view())
    { copy(other, *this); }

    array(array&& other)
        : m_device(other.m_device), extent(other.extent)
    { other.m_device.reset(); }
    explicit array(int e0)
        : array(hc::extent<N>(e0)) { static_assert(N == 1, "illegal"); }
    template <typename InputIter>
        array(const extent<N>& ext, InputIter srcBegin)
            : array(ext, srcBegin, accelerator(L"default").get_default_view()) {}
    template <typename InputIter>
        array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
            : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}

    template <typename InputIter>
        array(int e0, InputIter srcBegin)
            : array(hc::extent<N>(e0), srcBegin) {}
    template <typename InputIter>
        array(int e0, InputIter srcBegin, InputIter srcEnd)
            : array(hc::extent<N>(e0), srcBegin, srcEnd) {}
    template <typename InputIter>
        array(int e0, int e1, InputIter srcBegin)
            : array(hc::extent<N>(e0, e1), srcBegin) {}
    template <typename InputIter>
        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
            : array(hc::extent<N>(e0, e1), srcBegin, srcEnd) {}
    template <typename InputIter>
        array(int e0, int e1, int e2, InputIter srcBegin)
            : array(hc::extent<N>(e0, e1, e2), srcBegin) {}
    template <typename InputIter>
        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
            : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd) {}
    explicit array(const array_view<const T, N>& src)
        : array(src.get_extent(), accelerator(L"default").get_default_view())
    { copy(src, *this); }

    array(const extent<N>& ext, accelerator_view av, access_type cpu_access_type = access_type_auto)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(ext.size()), extent(ext) {}
#else
        : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {}
#endif

    explicit array(int e0, void* accelerator_pointer)
        : array(hc::extent<N>(e0), accelerator_pointer) {}
    explicit array(int e0, int e1, void* accelerator_pointer)
        : array(hc::extent<N>(e0, e1), accelerator_pointer) {}
    explicit array(int e0, int e1, int e2, void* accelerator_pointer)
        : array(hc::extent<N>(e0, e1, e2), accelerator_pointer) {}

    explicit array(const extent<N>& ext, void* accelerator_pointer)
        : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {}

    array(const extent<N>& ext, accelerator_view av, void* accelerator_pointer,
          access_type cpu_access_type = access_type_auto)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(ext.size(), accelerator_pointer), extent(ext) {}
#else
        : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type), extent(ext) {}
#endif
    template <typename InputIter>
        array(const extent<N>& ext, InputIter srcBegin, accelerator_view av,
              access_type cpu_access_type = access_type_auto)
        : array(ext, av, cpu_access_type) { copy(srcBegin, *this); }
    template <typename InputIter>
        array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
              accelerator_view av, access_type cpu_access_type = access_type_auto)
        : array(ext, av, cpu_access_type) {
            if (ext.size() < std::distance(srcBegin, srcEnd))
                throw runtime_exception("errorMsg_throw", 0);
            copy(srcBegin, srcEnd, *this);
        }

    array(const array_view<const T, N>& src, accelerator_view av,
          access_type cpu_access_type = access_type_auto)
        : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); }
    template <typename InputIter>
        array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
            : array(extent<N>(e0), srcBegin, av, cpu_access_type) {}
    template <typename InputIter>
        array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
            : array(extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {}
    template <typename InputIter>
        array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
            : array(hc::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {}
    template <typename InputIter>
        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
            : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {}
    template <typename InputIter>
        array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
            : array(hc::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {}
    template <typename InputIter>
        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
            : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {}
    array(const extent<N>& ext, accelerator_view av, accelerator_view associated_av)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(ext.size()), extent(ext) {}
#else
        : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {}
#endif

    template <typename InputIter>
        array(const extent<N>& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
            : array(ext, av, associated_av) { copy(srcBegin, *this); }
    template <typename InputIter>
        array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
            : array(ext, av, associated_av) {
            if (ext.size() < std::distance(srcBegin, srcEnd))
                throw runtime_exception("errorMsg_throw", 0);
            copy(srcBegin, srcEnd, *this);
        }

    array(const array_view<const T, N>& src, accelerator_view av, accelerator_view associated_av)
        : array(src.get_extent(), av, associated_av)
    { copy(src, *this); }
    template <typename InputIter>
        array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
            : array(extent<N>(e0), srcBegin, av, associated_av) {}
    template <typename InputIter>
        array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
            : array(extent<N>(e0), srcBegin, srcEnd, av, associated_av) {}
    template <typename InputIter>
        array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
            : array(hc::extent<N>(e0, e1), srcBegin, av, associated_av) {}
    template <typename InputIter>
        array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
            : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, associated_av) {}
    template <typename InputIter>
        array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
            : array(hc::extent<N>(e0, e1, e2), srcBegin, av, associated_av) {}
    template <typename InputIter>
        array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
            : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {}
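    /**
     * Example (illustrative sketch): the constructor families above in use.
     * @code
     *   std::vector<float> init(100, 1.0f);
     *   hc::array<float, 1> a(100);                               // uninitialized, default view
     *   hc::array<float, 1> b(100, init.begin());                 // copied from host iterator
     *   hc::array<float, 2> c(10, 10, init.begin(), init.end());  // 2D, size-checked
     * @endcode
     */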
    array& operator=(const array& other) {
        if (this != &other) {
            array arr(other);
            *this = std::move(arr);
        }
        return *this;
    }

    array& operator=(array&& other) {
        if (this != &other) {
            extent = other.extent;
            m_device = other.m_device;
            other.m_device.reset();
        }
        return *this;
    }

    array& operator=(const array_view<const T, N>& src) {
        array arr(src);
        *this = std::move(arr);
        return *this;
    }

    void copy_to(array& dest) const {
#if __KALMAR_ACCELERATOR__ != 1
        for(int i = 0 ; i < N ; i++)
        {
            if (dest.extent[i] < this->extent[i] )
                throw runtime_exception("errorMsg_throw", 0);
        }
#endif
        copy(*this, dest);
    }

    T* data() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (!m_device.get())
            return nullptr;
        m_device.synchronize(true);
#endif
        return reinterpret_cast<T*>(m_device.get());
    }

    T* accelerator_pointer() const __CPU__ __HC__ {
        return reinterpret_cast<T*>(m_device.get_device_pointer());
    }
    operator std::vector<T>() const {
        std::vector<T> vec(extent.size());
        copy(*this, vec.data());
        return std::move(vec);
    }
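    /**
     * Example (illustrative sketch): round-trip device data back to the host
     * via the std::vector conversion, which performs a blocking copy.
     * @code
     *   hc::array<int, 1> dev(16);
     *   // ... fill dev with parallel_for_each ...
     *   std::vector<int> host = dev;   // copies extent.size() elements
     * @endcode
     */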
    T& operator[](const index<N>& idx) __CPU__ __HC__ {
#ifndef __KALMAR_ACCELERATOR__
        if (!m_device.get())
            throw runtime_exception("The array is not accessible on CPU.", 0);
        m_device.synchronize(true);
#endif
        T *ptr = reinterpret_cast<T*>(m_device.get());
        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
    }
    T& operator()(const index<N>& idx) __CPU__ __HC__ {
        return (*this)[idx];
    }

    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        if (!m_device.get())
            throw runtime_exception("The array is not accessible on CPU.", 0);
        m_device.synchronize();
#endif
        T *ptr = reinterpret_cast<T*>(m_device.get());
        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
    }
    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
        return (*this)[idx];
    }

    T& operator()(int i0, int i1, int i2) __CPU__ __HC__ {
        return (*this)[index<3>(i0, i1, i2)];
    }
    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
        return (*this)[index<3>(i0, i1, i2)];
    }

    typename array_projection_helper<T, N>::result_type
        operator[] (int i) __CPU__ __HC__ {
            return array_projection_helper<T, N>::project(*this, i);
        }
    typename array_projection_helper<T, N>::const_result_type
        operator[] (int i) const __CPU__ __HC__ {
            return array_projection_helper<T, N>::project(*this, i);
        }

    array_view<T, N> section(const index<N>& origin, const extent<N>& ext) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        // ... (bounds checking elided)
#endif
        array_view<T, N> av(*this);
        return av.section(origin, ext);
    }
    array_view<const T, N> section(const index<N>& origin, const extent<N>& ext) const __CPU__ __HC__ {
        array_view<const T, N> av(*this);
        return av.section(origin, ext);
    }
#if __KALMAR_ACCELERATOR__ != 1
    // ... (additional section overloads elided)
#endif

    array_view<T, 1> section(int i0, int e0) __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), extent<1>(e0));
    }
    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), extent<1>(e0));
    }
    array_view<T, 2> section(int i0, int i1, int e0, int e1) __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), extent<2>(e0, e1));
    }
    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), extent<2>(e0, e1));
    }
    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), extent<3>(e0, e1, e2));
    }
    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), extent<3>(e0, e1, e2));
    }
    template <typename ElementType>
        array_view<ElementType, 1> reinterpret_as() __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
            static_assert( ! (std::is_pointer<ElementType>::value ), "can't use pointer in the kernel");
            static_assert( ! (std::is_same<ElementType,short>::value ), "can't use short in the kernel");
            if( (extent.size() * sizeof(T)) % sizeof(ElementType))
                throw runtime_exception("errorMsg_throw", 0);
#endif
            int size = extent.size() * sizeof(T) / sizeof(ElementType);
            using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
            array_view<ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
            return av;
        }
    template <typename ElementType>
        array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
            static_assert( ! (std::is_pointer<ElementType>::value ), "can't use pointer in the kernel");
            static_assert( ! (std::is_same<ElementType,short>::value ), "can't use short in the kernel");
#endif
            int size = extent.size() * sizeof(T) / sizeof(ElementType);
            using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
            array_view<const ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
            return av;
        }
#if __KALMAR_ACCELERATOR__ != 1
    // ... (view_as and related internals elided)
#endif

    const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; }
    int get_offset() const __CPU__ __HC__ { return 0; }

private:
    acc_buffer_t m_device;
    hc::extent<N> extent;

    template <typename Q, int K> friend
        struct projection_helper;
    template <typename Q, int K> friend
        struct array_projection_helper;
};
friend  5126 template <
typename T, 
int N = 1>
  5130     typedef typename std::remove_const<T>::type nc_T;
  5131 #if __KALMAR_ACCELERATOR__ == 1  5132     typedef Kalmar::_data<T> acc_buffer_t;
  5134     typedef Kalmar::_data_host<T> acc_buffer_t;
  5140     static const int rank = N;
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(const extent<N>& extent, Container& src)
            : array_view(extent, src.data())
        { static_assert( std::is_same<decltype(src.data()), T*>::value,
                         "container element type and array view element type must match"); }

    array_view(const extent<N>& ext, value_type* src) __CPU__ __HC__
#if __KALMAR_ACCELERATOR__ == 1
        : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {}
#else
        : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {}
#endif

    explicit array_view(const extent<N>& ext)
        : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(int e0, Container& src)
            : array_view(hc::extent<N>(e0), src) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(int e0, int e1, Container& src)
            : array_view(hc::extent<N>(e0, e1), src) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(int e0, int e1, int e2, Container& src)
            : array_view(hc::extent<N>(e0, e1, e2), src) {}

    array_view(int e0, value_type *src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0), src) {}
    array_view(int e0, int e1, value_type *src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0, e1), src) {}
    array_view(int e0, int e1, int e2, value_type *src) __CPU__ __HC__
        : array_view(hc::extent<N>(e0, e1, e2), src) {}

    explicit array_view(int e0)
        : array_view(hc::extent<N>(e0)) {}
    explicit array_view(int e0, int e1)
        : array_view(hc::extent<N>(e0, e1)) {}
    explicit array_view(int e0, int e1, int e2)
        : array_view(hc::extent<N>(e0, e1, e2)) {}
    array_view(const array_view& other) __CPU__ __HC__
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}

    array_view& operator=(const array_view& other) __CPU__ __HC__ {
        if (this != &other) {
            cache = other.cache;
            extent = other.extent;
            index_base = other.index_base;
            extent_base = other.extent_base;
            offset = other.offset;
        }
        return *this;
    }
    void copy_to(array<T, N>& dest) const {
#if __KALMAR_ACCELERATOR__ != 1
        for(int i = 0; i < N; i++)
        {
            if (dest.get_extent()[i] < this->extent[i])
                throw runtime_exception("errorMsg_throw", 0);
        }
#endif
        copy(*this, dest);
    }

    T* data() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        static_assert(N == 1, "data() is only permissible on array views of rank 1");
        return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
    }

    T* accelerator_pointer() const __CPU__ __HC__ {
        return reinterpret_cast<T*>(cache.get_device_pointer() + offset + index_base[0]);
    }

    completion_future synchronize_async() const {
        std::future<void> fut = std::async([&]() mutable { synchronize(); });
        return completion_future(fut.share());
    }

    void synchronize_to(const accelerator_view& av) const {
#if __KALMAR_ACCELERATOR__ != 1
        cache.sync_to(av.pQueue);
#endif
    }

    T& operator[] (const index<N>& idx) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T*>(cache.get() + offset);
        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
    }
    T& operator() (const index<N>& idx) const __CPU__ __HC__ {
        return (*this)[idx];
    }

    T& get_ref(const index<N>& idx) const __CPU__ __HC__;

    T& operator() (int i0, int i1) const __CPU__ __HC__ {
        static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
        return (*this)[index<2>(i0, i1)];
    }
    T& operator() (int i0, int i1, int i2) const __CPU__ __HC__ {
        static_assert(N == 3, "T& array_view::operator()(int,int,int) is only permissible on array_view<T, 3>");
        return (*this)[index<3>(i0, i1, i2)];
    }

    typename projection_helper<T, N>::result_type
        operator[] (int i) const __CPU__ __HC__ {
            return projection_helper<T, N>::project(*this, i);
        }
    typename projection_helper<T, N>::result_type
        operator() (int i0) const __CPU__ __HC__ { return (*this)[i0]; }
    array_view<T, N> section(const index<N>& idx,
                             const extent<N>& ext) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        // ... (bounds checking elided)
#endif
        array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset);
        return av;
    }

    array_view<T, N> section(const index<N>& idx) const __CPU__ __HC__ {
        hc::extent<N> ext(extent);
        Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
        return section(idx, ext);
    }

    array_view<T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
        index<N> idx;
        return section(idx, ext);
    }

    array_view<T, 1> section(int i0, int e0) const __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), extent<1>(e0));
    }
    array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), extent<2>(e0, e1));
    }
    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), extent<3>(e0, e1, e2));
    }
    template <typename ElementType>
    array_view<ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
        if ((extent.size() * sizeof(T)) % sizeof(ElementType)) {
            // ... a size mismatch is rejected on the host path
        }
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        return array_view<ElementType, 1>(buffer_type(cache), hc::extent<1>(size),
                                          (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
    }
    template <int K>
    array_view<T, K> view_as(const extent<K>& viewExtent) const __CPU__ __HC__ {
        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        // ...
    }

    ~array_view() __CPU__ __HC__ {}

    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }

    int get_offset() const __CPU__ __HC__ { return offset; }

    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;
    // ... friend declarations for the copy() overloads over arrays, array_views,
    // and input/output iterators (declarator lines elided)

    // Internal constructors used when sectioning and projecting views:
    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
               int offset) __CPU__ __HC__
        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}

    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
               const hc::extent<N>& ext_b,
               const index<N>& idx_b, int off) __CPU__ __HC__
        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
          offset(off) {}

    acc_buffer_t cache;
    hc::extent<N> extent;
    hc::extent<N> extent_base;
    index<N> index_base;
    int offset;
};
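//
// Example (not part of hc.hpp; "data" is an illustrative name): an array_view
// wraps host storage and hands out element references, so the rank-2 view
// below aliases the vector rather than copying it. A minimal sketch of
// construction, element access, and rank-reducing projection:
//
//     std::vector<int> data(6);
//     hc::array_view<int, 2> av(2, 3, data);  // 2 rows x 3 columns over data
//     av(1, 2) = 42;                          // same storage as data[5]
//     auto row = av[1];                       // operator[](int) projects to rank 1
//     int x = row[2];                         // reads 42
//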
template <typename T, int N>
class array_view<const T, N>
{
public:
    typedef typename std::remove_const<T>::type nc_T;
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<nc_T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<const T> acc_buffer_t;
#endif

    static const int rank = N;
    typedef const T value_type;

    array_view() = delete;

    array_view(const array<T, N>& src) __CPU__ __HC__
        : cache(src.internal()), extent(src.get_extent()), extent_base(extent),
          index_base(), offset(0) {}
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(const hc::extent<N>& extent, const Container& src)
            : array_view(extent, src.data())
        { static_assert( std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value, "container element type and array view element type must match"); }
    array_view(const hc::extent<N>& ext, const value_type* src) __CPU__ __HC__
#if __KALMAR_ACCELERATOR__ == 1
        : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
#else
        : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
#endif
    // The (e0[, e1[, e2]]) convenience constructors are equivalent to the
    // extent<N> forms; their bodies are elided here.
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(int e0, Container& src);
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(int e0, int e1, Container& src);
    template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
        array_view(int e0, int e1, int e2, Container& src);

    array_view(int e0, int e1, const value_type *src) __CPU__ __HC__;
    array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__;
    array_view(const array_view<T, N>& other) __CPU__ __HC__
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}

    array_view(const array_view& other) __CPU__ __HC__
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}
    array_view& operator=(const array_view<T, N>& other) __CPU__ __HC__ {
        cache = other.cache;
        extent = other.extent;
        index_base = other.index_base;
        extent_base = other.extent_base;
        offset = other.offset;
        return *this;
    }

    array_view& operator=(const array_view& other) __CPU__ __HC__ {
        if (this != &other) {
            cache = other.cache;
            extent = other.extent;
            index_base = other.index_base;
            extent_base = other.extent_base;
            offset = other.offset;
        }
        return *this;
    }
    const T* data() const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access();
#endif
        static_assert(N == 1, "data() is only permissible on array views of rank 1");
        return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
    }

    const T* accelerator_pointer() const __CPU__ __HC__ {
        return reinterpret_cast<const T*>(cache.get_device_pointer() + offset + index_base[0]);
    }
    completion_future synchronize_async() const {
        std::future<void> fut = std::async([&]() mutable { synchronize(); });
        return completion_future(fut.share());
    }

    void synchronize_to(const accelerator_view& av) const {
#if __KALMAR_ACCELERATOR__ != 1
        cache.sync_to(av.pQueue);
#endif
    }
    const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access();
#endif
        const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
        return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
    }

    const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
        return (*this)[idx];
    }

    const T& get_ref(const index<N>& idx) const __CPU__ __HC__;
    const T& operator()(int i0) const __CPU__ __HC__ {
        static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
        return (*this)[index<1>(i0)];
    }

    const T& operator()(int i0, int i1) const __CPU__ __HC__ {
        static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
        return (*this)[index<2>(i0, i1)];
    }

    const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
        static_assert(N == 3, "const T& array_view::operator()(int,int,int) is only permissible on array_view<T, 3>");
        return (*this)[index<3>(i0, i1, i2)];
    }
        operator[](int i) const __CPU__ __HC__ {
            // ... (rank-reducing projection)
        }
    array_view<const T, N> section(const index<N>& idx,
                                   const extent<N>& ext) const __CPU__ __HC__ {
        // ...
    }

    array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
        hc::extent<N> ext(extent);
        Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
        return section(idx, ext);
    }

    array_view<const T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
        index<N> idx;
        return section(idx, ext);
    }
    array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
        static_assert(N == 1, "Rank must be 1");
        return section(index<1>(i0), hc::extent<1>(e0));
    }

    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
        static_assert(N == 2, "Rank must be 2");
        return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
    }

    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
        static_assert(N == 3, "Rank must be 3");
        return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
    }
    template <typename ElementType>
    array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        return array_view<const ElementType, 1>(buffer_type(cache), hc::extent<1>(size),
                                                (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
    }
    template <int K>
    array_view<const T, K> view_as(const extent<K>& viewExtent) const __CPU__ __HC__ {
        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        // ...
    }

    ~array_view() __CPU__ __HC__ {}

    const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }

    int get_offset() const __CPU__ __HC__ { return offset; }

    index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;
    // ... friend declarations for the copy() overloads (declarator lines elided)

    // Internal constructors used when sectioning and projecting views:
    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
               int offset) __CPU__ __HC__
        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}

    array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
               const hc::extent<N>& ext_b,
               const index<N>& idx_b, int off) __CPU__ __HC__
        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
          offset(off) {}

    acc_buffer_t cache;
    hc::extent<N> extent;
    hc::extent<N> extent_base;
    index<N> index_base;
    int offset;
};
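//
// Example (not part of hc.hpp; "table" is an illustrative name):
// array_view<const T, N> is the read-only counterpart; kernels can capture it
// by value for input data that never needs to be written back. A sketch:
//
//     std::vector<float> table(100, 1.0f);
//     hc::array_view<const float, 1> in(100, table);
//     float f = in(7);              // const element access
//     const float* p = in.data();   // rank-1 only: pointer to the host data
//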
template <typename T, int N>
static inline bool is_flat(const array_view<T, N>& av) noexcept {
    return av.extent == av.extent_base && av.index_base == index<N>();
}

template <typename T>
static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
template <typename InputIter, typename T, int N, int dim>
struct copy_input
{
    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        size_t stride = 1;
        for (int i = dim; i < N; i++)
            stride *= base[i];
        ptr += stride * idx[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse one dimension deeper for each row
        }
    }
};

template <typename InputIter, typename T, int N>
struct copy_input<InputIter, T, N, N>
{
    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        InputIter end = It;
        std::advance(end, ext[N - 1]);
        // ... copy [It, end) into ptr and advance It to end
    }
};
template <typename OutputIter, typename T, int N, int dim>
struct copy_output
{
    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        size_t stride = 1;
        for (int i = dim; i < N; i++)
            stride *= base[i];
        ptr += stride * idx[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse one dimension deeper for each row
        }
    }
};

template <typename OutputIter, typename T, int N>
struct copy_output<OutputIter, T, N, N>
{
    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        It = std::copy(ptr, ptr + ext[N - 1], It);
    }
};
template <typename T, int N, int dim>
struct copy_bidir
{
    void operator()(const T* src, T* dst, const extent<N>& ext,
                    const extent<N>& base1, const index<N>& idx1,
                    const extent<N>& base2, const index<N>& idx2)
    {
        size_t stride1 = 1;
        for (int i = dim; i < N; i++)
            stride1 *= base1[i];
        src += stride1 * idx1[dim - 1];

        size_t stride2 = 1;
        for (int i = dim; i < N; i++)
            stride2 *= base2[i];
        dst += stride2 * idx2[dim - 1];

        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse one dimension deeper for each row
        }
    }
};

template <typename T, int N>
struct copy_bidir<T, N, N>
{
    void operator()(const T* src, T* dst, const extent<N>& ext,
                    const extent<N>& base1, const index<N>& idx1,
                    const extent<N>& base2, const index<N>& idx2)
    {
        // ... contiguous copy over the innermost dimension
    }
};
template <typename Iter, typename T, int N>
struct do_copy
{
    template <template <typename, int> class _amp_container>
    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
        size_t size = dest.get_extent().size();
        size_t offset = dest.get_offset();
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        std::copy(srcBegin, srcEnd, ptr);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, N>& src, Iter destBegin) {
        size_t size = src.get_extent().size();
        size_t offset = src.get_offset();
        bool modify = false;
        const T* ptr = src.internal().map_ptr(modify, size, offset);
        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
};
template <typename Iter, typename T>
struct do_copy<Iter, T, 1>
{
    template <template <typename, int> class _amp_container>
    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
        size_t size = dest.get_extent().size();
        size_t offset = dest.get_offset() + dest.get_index_base()[0];
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        std::copy(srcBegin, srcEnd, ptr);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, 1>& src, Iter destBegin) {
        size_t size = src.get_extent().size();
        size_t offset = src.get_offset() + src.get_index_base()[0];
        bool modify = false;
        const T* ptr = src.internal().map_ptr(modify, size, offset);
        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
};
template <typename T, int N>
struct do_copy<T*, T, N>
{
    template <template <typename, int> class _amp_container>
    void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, N>& src, T* destBegin) {
        src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
    }
};

template <typename T>
struct do_copy<T*, T, 1>
{
    template <template <typename, int> class _amp_container>
    void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
                              dest.get_offset() + dest.get_index_base()[0], true);
    }

    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, 1>& src, T* destBegin) {
        src.internal().read(destBegin, src.get_extent().size(),
                            src.get_offset() + src.get_index_base()[0]);
    }
};
template <typename T, int N>
void copy(const array<T, N>& src, array<T, N>& dest) {
    src.internal().copy(dest.internal(), 0, 0, 0);
}
template <typename T, int N>
void copy(const array<T, N>& src, const array_view<T, N>& dest) {
    if (is_flat(dest)) {
        src.internal().copy(dest.internal(), src.get_offset(),
                            dest.get_offset(), dest.get_extent().size());
    } else {
        size_t srcSize = src.extent.size();
        size_t srcOffset = 0;
        bool srcModify = false;
        size_t destSize = dest.extent_base.size();
        size_t destOffset = dest.offset;
        bool destModify = true;
        T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
        T* p = pSrc;   // the row-wise copy advances pSrc, so keep the original mapping
        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
        // ... scatter row by row into the strided destination, advancing pSrc
        dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
        src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
    }
}
template <typename T>
void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename T, int N>
void copy(const array_view<const T, N>& src, array<T, N>& dest) {
    if (is_flat(src)) {
        src.internal().copy(dest.internal(), src.get_offset(),
                            dest.get_offset(), dest.get_extent().size());
    } else {
        size_t srcSize = src.extent_base.size();
        size_t srcOffset = src.offset;
        bool srcModify = false;
        size_t destSize = dest.extent.size();
        size_t destOffset = 0;
        bool destModify = true;
        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
        T* p = pDst;   // the row-wise copy advances pDst, so keep the original mapping
        const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
        // ... gather row by row from the strided source, advancing pDst
        src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
    }
}
template <typename T, int N>
void copy(const array_view<T, N>& src, array<T, N>& dest);   // ... definition elided

template <typename T>
void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename T, int N>
void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
    if (is_flat(src)) {
        if (is_flat(dest)) {
            src.internal().copy(dest.internal(), src.get_offset(),
                                dest.get_offset(), dest.get_extent().size());
        } else {
            size_t srcSize = src.extent.size();
            size_t srcOffset = 0;
            bool srcModify = false;
            size_t destSize = dest.extent_base.size();
            size_t destOffset = dest.offset;
            bool destModify = true;
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            const T* p = pSrc;   // the row-wise copy advances pSrc
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            // ... scatter row by row into the strided destination
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
        }
    } else {
        if (is_flat(dest)) {
            size_t srcSize = src.extent_base.size();
            size_t srcOffset = src.offset;
            bool srcModify = false;
            size_t destSize = dest.extent.size();
            size_t destOffset = 0;
            bool destModify = true;
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            T* p = pDst;   // the row-wise copy advances pDst
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            // ... gather row by row from the strided source
            dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        } else {
            size_t srcSize = src.extent_base.size();
            size_t srcOffset = src.offset;
            bool srcModify = false;
            size_t destSize = dest.extent_base.size();
            size_t destOffset = dest.offset;
            bool destModify = true;
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            copy_bidir<T, N, 1>()(pSrc, pDst, dest.extent, src.extent_base,
                                  src.index_base, dest.extent_base, dest.index_base);
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        }
    }
}
template <typename T, int N>
void copy(const array_view<T, N>& src, const array_view<T, N>& dest);   // ... definition elided

template <typename T>
void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
#if __KALMAR_ACCELERATOR__ != 1
    if ((std::distance(srcBegin, srcEnd) <= 0) ||
        (std::distance(srcBegin, srcEnd) < dest.get_extent().size())) {
        // ... rejected: the input range must cover the whole destination extent
    }
#endif
    // ...
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, array<T, N>& dest) {
    InputIter srcEnd = srcBegin;
    std::advance(srcEnd, dest.get_extent().size());
    copy(srcBegin, srcEnd, dest);
}
template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
    if (is_flat(dest)) {
        // ...
    } else {
        size_t size = dest.extent_base.size();
        size_t offset = dest.offset;
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        // ... scatter [srcBegin, srcEnd) row by row into the strided view
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, const array_view<T, N>& dest) {
    InputIter srcEnd = srcBegin;
    std::advance(srcEnd, dest.get_extent().size());
    copy(srcBegin, srcEnd, dest);
}
template <typename OutputIter, typename T, int N>
void copy(const array<T, N>& src, OutputIter destBegin);   // ... definition elided

template <typename OutputIter, typename T, int N>
void copy(const array_view<T, N>& src, OutputIter destBegin) {
    if (is_flat(src)) {
        // ...
    } else {
        size_t size = src.extent_base.size();
        size_t offset = src.offset;
        bool modify = false;
        T* ptr = src.internal().map_ptr(modify, size, offset);
        // ... gather row by row from the strided view into destBegin
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
}
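//
// Example (not part of hc.hpp): the synchronous copy() overloads above move
// data between host iterators, array, and array_view. A minimal sketch:
//
//     hc::array<int, 1> device_buf(1024);
//     std::vector<int> host_buf(1024, 7);
//     hc::copy(host_buf.begin(), host_buf.end(), device_buf);  // host -> device
//     hc::copy(device_buf, host_buf.begin());                  // device -> host
//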
template <typename T, int N>
completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
    return completion_future(fut.share());
}

template <typename OutputIter, typename T, int N>
completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
    std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
    return completion_future(fut.share());
}

template <typename OutputIter, typename T, int N>
completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
    std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
    return completion_future(fut.share());
}

// ... three further copy_async overloads (signatures elided in this listing)
// share the same deferred-std::async body.
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_exchange_int(int *p, int val) __HC__;
extern "C" float atomic_exchange_float(float *p, float val) __HC__;
extern "C" uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ {
  return atomic_exchange_unsigned(dest, val);
}
static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ {
  return atomic_exchange_int(dest, val);
}
static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ {
  return atomic_exchange_float(dest, val);
}
static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ {
  return atomic_exchange_uint64(dest, val);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
int atomic_exchange_int(int *p, int val);
float atomic_exchange_float(float *p, float val);
uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ {
  return atomic_exchange_unsigned(dest, val);
}
static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ {
  return atomic_exchange_int(dest, val);
}
static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ {
  return atomic_exchange_float(dest, val);
}
static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ {
  return atomic_exchange_uint64(dest, val);
}
#else
extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__;
extern int atomic_exchange(int *dest, int val) __CPU__ __HC__;
extern float atomic_exchange(float *dest, float val) __CPU__ __HC__;
extern uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__;
#endif
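//
// Example (not part of hc.hpp): atomic_exchange returns the previous value,
// which is enough to let exactly one work-item claim a flag. A sketch where
// "flag" is an illustrative pointer into device-visible memory:
//
//     // inside a kernel:
//     unsigned int old = hc::atomic_exchange(flag, 1u);
//     if (old == 0u) {
//         // this work-item observed 0 and owns the slot; all others see 1
//     }
//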
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) __HC__;
extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) __HC__;
extern "C" uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val) __HC__;

static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
  return (*dest == val);
}
static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
  return (*dest == val);
}
static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
  return (*dest == val);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
int atomic_compare_exchange_int(int *dest, int expected_val, int val);
uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val);

static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
  return (*dest == val);
}
static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
  return (*dest == val);
}
static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
  return (*dest == val);
}
#else
extern bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__;
extern bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__;
extern bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__;
#endif
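//
// Example (not part of hc.hpp): the usual retry loop around
// atomic_compare_exchange. On failure the wrapper writes the freshly observed
// value back into "expected", so the loop re-tests against current state.
// A sketch; "counter" and "cap" are illustrative:
//
//     unsigned int expected = *counter;
//     while (expected < cap &&
//            !hc::atomic_compare_exchange(counter, &expected, expected + 1u)) {
//         // expected now holds the latest value of *counter; try again
//     }
//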
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_add_int(int *p, int val) __HC__;
extern "C" float atomic_add_float(float *p, float val) __HC__;
extern "C" uint64_t atomic_add_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_add_unsigned(x, y);
}
static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
  return atomic_add_int(x, y);
}
static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
  return atomic_add_float(x, y);
}
static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_add_uint64(x, y);
}

extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_sub_int(int *p, int val) __HC__;
extern "C" float atomic_sub_float(float *p, float val) __HC__;

static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_sub_unsigned(x, y);
}
static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
  return atomic_sub_int(x, y);
}
static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
  return atomic_sub_float(x, y);
}

extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_and_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_and_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_and_unsigned(x, y);
}
static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
  return atomic_and_int(x, y);
}
static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_and_uint64(x, y);
}

extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_or_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_or_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_or_unsigned(x, y);
}
static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
  return atomic_or_int(x, y);
}
static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_or_uint64(x, y);
}

extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_xor_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_xor_unsigned(x, y);
}
static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
  return atomic_xor_int(x, y);
}
static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_xor_uint64(x, y);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
int atomic_add_int(int *p, int val);
float atomic_add_float(float *p, float val);
uint64_t atomic_add_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_add_unsigned(x, y);
}
static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
  return atomic_add_int(x, y);
}
static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
  return atomic_add_float(x, y);
}
static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_add_uint64(x, y);
}

unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
int atomic_sub_int(int *p, int val);
float atomic_sub_float(float *p, float val);

static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_sub_unsigned(x, y);
}
static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
  return atomic_sub_int(x, y);
}
static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
  return atomic_sub_float(x, y);
}

unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
int atomic_and_int(int *p, int val);
uint64_t atomic_and_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_and_unsigned(x, y);
}
static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
  return atomic_and_int(x, y);
}
static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_and_uint64(x, y);
}

unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
int atomic_or_int(int *p, int val);
uint64_t atomic_or_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_or_unsigned(x, y);
}
static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
  return atomic_or_int(x, y);
}
static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_or_uint64(x, y);
}

unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
int atomic_xor_int(int *p, int val);
uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
  return atomic_xor_unsigned(x, y);
}
static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
  return atomic_xor_int(x, y);
}
static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
  return atomic_xor_uint64(x, y);
}
#else
// ... extern declarations for atomic_fetch_add, atomic_fetch_sub, and
// atomic_fetch_and over the same types, then:
extern unsigned atomic_fetch_or(unsigned *x, unsigned y) __CPU__ __HC__;
extern int atomic_fetch_or(int *x, int y) __CPU__ __HC__;
extern uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__;
// ... and likewise for atomic_fetch_xor
#endif
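//
// Example (not part of hc.hpp): atomic_fetch_add is the natural tool for a
// histogram, where many work-items bump the same bins concurrently. A sketch;
// "samples" is an assumed rank-1 array_view<const int, 1> of input data:
//
//     std::vector<unsigned int> bin_storage(256, 0);
//     hc::array_view<unsigned int, 1> bins(256, bin_storage);
//     hc::parallel_for_each(samples.get_extent(), [=](hc::index<1> i) [[hc]] {
//         hc::atomic_fetch_add(&bins[samples[i] % 256], 1u);
//     });
//     bins.synchronize();   // bin_storage now holds the counts
//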
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_max_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_max_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
  return atomic_max_unsigned(x, y);
}
static inline int atomic_fetch_max(int *x, int y) __HC__ {
  return atomic_max_int(x, y);
}
static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
  return atomic_max_uint64(x, y);
}

extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) __HC__;
extern "C" int atomic_min_int(int *p, int val) __HC__;
extern "C" uint64_t atomic_min_uint64(uint64_t *p, uint64_t val) __HC__;

static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
  return atomic_min_unsigned(x, y);
}
static inline int atomic_fetch_min(int *x, int y) __HC__ {
  return atomic_min_int(x, y);
}
static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
  return atomic_min_uint64(x, y);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
int atomic_max_int(int *p, int val);
uint64_t atomic_max_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
  return atomic_max_unsigned(x, y);
}
static inline int atomic_fetch_max(int *x, int y) __HC__ {
  return atomic_max_int(x, y);
}
static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
  return atomic_max_uint64(x, y);
}

unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
int atomic_min_int(int *p, int val);
uint64_t atomic_min_uint64(uint64_t *p, uint64_t val);

static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
  return atomic_min_unsigned(x, y);
}
static inline int atomic_fetch_min(int *x, int y) __HC__ {
  return atomic_min_int(x, y);
}
static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
  return atomic_min_uint64(x, y);
}
#else
extern int atomic_fetch_max(int * dest, int val) __CPU__ __HC__;
extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) __CPU__ __HC__;
extern uint64_t atomic_fetch_max(uint64_t * dest, uint64_t val) __CPU__ __HC__;

extern int atomic_fetch_min(int * dest, int val) __CPU__ __HC__;
extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) __CPU__ __HC__;
extern uint64_t atomic_fetch_min(uint64_t * dest, uint64_t val) __CPU__ __HC__;
#endif
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) __HC__;
extern "C" int atomic_inc_int(int *p) __HC__;

static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
  return atomic_inc_unsigned(x);
}
static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
  return atomic_inc_int(x);
}

extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) __HC__;
extern "C" int atomic_dec_int(int *p) __HC__;

static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
  return atomic_dec_unsigned(x);
}
static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
  return atomic_dec_int(x);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_inc_unsigned(unsigned int *p);
int atomic_inc_int(int *p);

static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
  return atomic_inc_unsigned(x);
}
static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
  return atomic_inc_int(x);
}

unsigned int atomic_dec_unsigned(unsigned int *p);
int atomic_dec_int(int *p);

static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
  return atomic_dec_unsigned(x);
}
static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
  return atomic_dec_int(x);
}
#else
// ... extern declarations for atomic_fetch_inc / atomic_fetch_dec
#endif
extern "C" unsigned int __atomic_wrapinc(unsigned int* address, unsigned int val) __HC__;

extern "C" unsigned int __atomic_wrapdec(unsigned int* address, unsigned int val) __HC__;
// parallel_for_each declarations over an explicit accelerator_view:
template <int N, typename Kernel>
completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);

// The overloads without an accelerator_view dispatch to the auto-selection view:
template <int N, typename Kernel>
completion_future parallel_for_each(const extent<N>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}
template <typename Kernel>
completion_future parallel_for_each(const tiled_extent<1>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}
template <typename Kernel>
completion_future parallel_for_each(const tiled_extent<2>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}
template <typename Kernel>
completion_future parallel_for_each(const tiled_extent<3>& compute_domain, const Kernel& f) {
    return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
}
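//
// Example (not part of hc.hpp): with the auto-selection overloads a kernel
// runs without naming an accelerator_view. A SAXPY sketch; "xs"/"ys" are
// illustrative host vectors of equal length:
//
//     hc::array_view<const float, 1> x((int)xs.size(), xs);
//     hc::array_view<float, 1> y((int)ys.size(), ys);
//     const float a = 2.0f;
//     hc::parallel_for_each(y.get_extent(), [=](hc::index<1> i) [[hc]] {
//         y[i] = a * x[i] + y[i];
//     });
//     y.synchronize();   // write the results back to ys
//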
template <int N, typename Kernel, typename _Tp>
struct pfe_helper
{
    static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
        int i;
        for (i = 0; i < k.ext[N - 1]; ++i) {
            // ... fix idx[N - 1] = i and recurse one dimension down
        }
    }
};

template <typename Kernel, typename _Tp>
struct pfe_helper<0, Kernel, _Tp>
{
    static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ == 1
        // ... invoke the wrapped user kernel with idx
#endif
    }
};

template <int N, typename Kernel>
class pfe_wrapper
{
public:
    explicit pfe_wrapper(const extent<N>& other, const Kernel& f) __CPU__ __HC__
        : ext(other), k(f) {}

    void operator()(index<N> idx) __CPU__ __HC__ {
        // ... iterate the extra dimensions via pfe_helper and call k
    }

private:
    const extent<N> ext;
    const Kernel k;

    template <int K, typename Ker, typename _Tp>
    friend struct pfe_helper;
};
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <int N, typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<N>& compute_domain, const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
    for (int i = 0; i < N; i++) {
        // each dimension must be non-zero, positive, and fit in 32 bits
        if (compute_domain[i] == 0) {
            // ...
        }
        if (compute_domain[i] < 0) {
            // ... rejected (throws)
        }
        if (static_cast<size_t>(compute_domain[i]) > 4294967295L) {
            // ... rejected (throws)
        }
    }
    size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
                     static_cast<size_t>(compute_domain[N - 2]),
                     static_cast<size_t>(compute_domain[N - 3])};
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (/* ... CPU fallback */)
        return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
    // ... wrap f in pfe_wrapper<N, Kernel> and launch over the first three dimensions
#else
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
#endif
    // ...
#endif
}
typename Kernel>
  7616 #if __KALMAR_ACCELERATOR__ != 1  7618   if (compute_domain[0] == 0)
  7620   if (compute_domain[0] < 0) {
  7623   if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
  7625 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2  7627         return launch_cpu_task_async(av.pQueue, f, compute_domain);
  7630   size_t ext = compute_domain[0];
  7634   return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 1>(av.pQueue, &ext, NULL, f));
  7635 #else //if __KALMAR_ACCELERATOR__ != 1  7638   auto foo = &Kernel::__cxxamp_trampoline;
  7639   auto bar = &Kernel::operator();
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
  if (compute_domain[0] == 0 || compute_domain[1] == 0) {
      // ...
  }
  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[0]) > 4294967295L) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[1]) > 4294967295L) {
      // ... rejected (throws)
  }
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
  if (/* ... CPU fallback */)
      return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
  size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
                   static_cast<size_t>(compute_domain[0])};
  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 2>(av.pQueue, ext, NULL, f));
#else //if __KALMAR_ACCELERATOR__ != 1
  auto foo = &Kernel::__cxxamp_trampoline;
  auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0) {
      // ...
  }
  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[0]) > 4294967295L) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[1]) > 4294967295L) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[2]) > 4294967295L) {
      // ... rejected (throws)
  }
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
  if (/* ... CPU fallback */)
      return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
  size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
                   static_cast<size_t>(compute_domain[1]),
                   static_cast<size_t>(compute_domain[0])};
  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 3>(av.pQueue, ext, NULL, f));
#else //if __KALMAR_ACCELERATOR__ != 1
  auto foo = &Kernel::__cxxamp_trampoline;
  auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
  if (compute_domain[0] == 0) {
      // ...
  }
  if (compute_domain[0] < 0) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[0]) > 4294967295L) {
      // ... rejected (throws)
  }
  size_t ext = compute_domain[0];
  size_t tile = compute_domain.tile_dim[0];
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
  if (/* ... CPU fallback */)
      return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
  // ... launch with the tile size and dynamic group segment
#else //if __KALMAR_ACCELERATOR__ != 1
  auto foo = &Kernel::__cxxamp_trampoline;
  auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
  if (compute_domain[0] == 0 || compute_domain[1] == 0) {
      // ...
  }
  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[0]) > 4294967295L) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[1]) > 4294967295L) {
      // ... rejected (throws)
  }
  size_t ext[2] = { static_cast<size_t>(compute_domain[1]),
                    static_cast<size_t>(compute_domain[0])};
  size_t tile[2] = { static_cast<size_t>(compute_domain.tile_dim[1]),
                     static_cast<size_t>(compute_domain.tile_dim[0]) };
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
  if (/* ... CPU fallback */)
      return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
  // ... launch with the tile sizes and dynamic group segment
#else //if __KALMAR_ACCELERATOR__ != 1
  auto foo = &Kernel::__cxxamp_trampoline;
  auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type"
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
completion_future parallel_for_each(const accelerator_view& av,
                                    const tiled_extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
#if __KALMAR_ACCELERATOR__ != 1
  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0) {
      // ...
  }
  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[0]) > 4294967295L) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[1]) > 4294967295L) {
      // ... rejected (throws)
  }
  if (static_cast<size_t>(compute_domain[2]) > 4294967295L) {
      // ... rejected (throws)
  }
  size_t ext[3] = { static_cast<size_t>(compute_domain[2]),
                    static_cast<size_t>(compute_domain[1]),
                    static_cast<size_t>(compute_domain[0])};
  size_t tile[3] = { static_cast<size_t>(compute_domain.tile_dim[2]),
                     static_cast<size_t>(compute_domain.tile_dim[1]),
                     static_cast<size_t>(compute_domain.tile_dim[0]) };
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
  if (/* ... CPU fallback */)
      return launch_cpu_task_async(av.pQueue, f, compute_domain);
#endif
  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
  // ... launch with the tile sizes and dynamic group segment
#else //if __KALMAR_ACCELERATOR__ != 1
  auto foo = &Kernel::__cxxamp_trampoline;
  auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
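//
// Example (not part of hc.hpp): a tiled launch pairs a tiled_extent with a
// kernel taking tiled_index; tile_static storage plus the tile barrier give a
// tile-local reduction. A sketch; "in" is an assumed rank-1 float view whose
// length is a multiple of 256:
//
//     hc::tiled_extent<1> te = in.get_extent().tile(256);
//     hc::parallel_for_each(te, [=](hc::tiled_index<1> tidx) [[hc]] {
//         tile_static float scratch[256];
//         scratch[tidx.local[0]] = in[tidx.global];
//         tidx.barrier.wait();   // every work-item in the tile has written scratch
//         // ... tree-reduce scratch; work-item local[0] == 0 stores the partial sum
//     });
//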
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
 
uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
The partial specialization array_view<const T,N> represents a view over elements of type const T with...
Definition: hc.hpp:5762
 
array_view(const extent< N > &ext)
Constructs an array_view which is not bound to a data source. 
Definition: hc.hpp:5211
 
bool get_is_debug() const 
Returns a boolean value indicating whether the accelerator_view supports debugging through extensive ...
Definition: hc.hpp:197
 
This class is the return type of all asynchronous APIs and has an interface analogous to std::shared_...
Definition: hc.hpp:1130
 
array(int e0, int e1, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory. 
Definition: hc.hpp:4387
 
uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the desitionation. 
 
void * get_hsa_agent()
Returns an opaque handle which points to the underlying HSA agent. 
Definition: hc.hpp:478
 
void flush()
Sends the queued up commands in the accelerator_view to the device for execution. ...
Definition: hc.hpp:241
 
array_view(const extent< N > &ext, value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container. 
Definition: hc.hpp:5194
 
extent(const extent &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:1604
 
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)". 
Definition: hc.hpp:6205
 
int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the desitionation. 
 
const T value_type
The element type of this array. 
Definition: hc.hpp:5781
 
int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits. 
 
tiled_extent(const extent< 1 > &ext, int t0) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext". 
Definition: hc.hpp:2087
 
array_view< const ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:6249
 
unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do linear interpolation and computes the unsigned 8-bit average of packed data. 
 
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6095
 
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)". 
Definition: hc.hpp:4649
 
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment. 
Definition: hc.hpp:2200
 
const index< 2 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile. 
Definition: hc.hpp:3689
 
float __amdgcn_wave_sl1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2943
 
array & operator=(array &&other)
Moves the contents of the array "other" to this array. 
Definition: hc.hpp:4705
 
array(int e0, int e1)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:4276
 
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)". 
Definition: hc.hpp:4643
 
bool get_is_auto_selection()
Returns a boolean value indicating whether the accelerator view when passed to a parallel_for_each wo...
Definition: hc.hpp:171
 
void * get_hsa_am_finegrained_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent. 
Definition: hc.hpp:515
 
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5225
 
tiled_extent(const extent< 2 > &ext, int t0, int t1, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext". 
Definition: hc.hpp:2192
 
const index< 3 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3468
 
bool operator==(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>. 
Definition: hc.hpp:1738
 
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:3456
 
std::vector< accelerator > get_peers() const 
Return a std::vector of this accelerator's peers. 
Definition: hc.hpp:1070
 
array(const array_view< const T, N > &src, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array initialized with the contents of the array_view "src". 
Definition: hc.hpp:4500
 
Represents an extent subdivided into tiles. 
Definition: hc.hpp:2218
 
uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
 
void * get_hsa_am_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent. 
Definition: hc.hpp:503
 
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5299
 
array(const extent< N > &ext, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4374
 
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes. 
Definition: hc.hpp:2309
 
uint64_t get_begin_tick()
Get the tick number when the underlying asynchronous operation begins. 
Definition: hc.hpp:1316
 
size_t get_dedicated_memory() const 
Returns the amount of dedicated memory (in KB) on an accelerator device. 
Definition: hc.hpp:907
 
tiled_extent(const tiled_extent &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:2018
 
float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with lower ID relative to caller within a wavefront. 
Definition: hc.hpp:3062
 
array_view< const T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:6187
 
float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
completion_future synchronize_async() const 
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:5425
 
unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
tiled_extent() __CPU__ __HC__
Default constructor. 
Definition: hc.hpp:2141
 
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__
Construct an tiled extent with the size of extent and the size of tile specified. ...
Definition: hc.hpp:2264
 
int __unpacklo_s16x2(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the desitionation. 
 
bool operator!=(const accelerator &other) const 
Compares "this" accelerator with the passed accelerator object to determine if they represent differe...
Definition: hc.hpp:847
 
array & operator=(const array &other)
Assigns the contents of the array "other" to this array, using a deep copy. 
Definition: hc.hpp:4690
 
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchrnous operation. 
Definition: hc.hpp:103
 
accelerator_view get_associated_accelerator_view() const 
This property returns the accelerator_view representing the preferred target where this array can be ...
Definition: hc.hpp:4675
 
accelerator_view get_source_accelerator_view() const 
Access the accelerator_view where the data source of the array_view is located. 
Definition: hc.hpp:5908
 
unsigned int __lastbit_u32_u32(unsigned int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit. 
Definition: hc.hpp:2535
 
float __amdgcn_wave_rl1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2982
 
void wait() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3330
 
The tile_barrier class is a capability class that is only creatable by the system, and passed to a tiled parallel_for_each function object as part of the tiled_index parameter. 
Definition: hc.hpp:3294
 
index< N > operator/(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:559
 
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:5649
 
accelerator()
Constructs a new accelerator object that represents the default accelerator. 
Definition: hc.hpp:713
 
array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)". 
Definition: hc.hpp:4572
 
The array_view<T,N> type represents a possibly cached view into the data held in an array<T...
Definition: hc.hpp:60
 
unsigned int __firstbit_u32_u32(unsigned int input) __HC__
Count leading zero bits in the input. 
Definition: hc.hpp:2477
 
extent & operator*=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent. 
Definition: hc.hpp:1763
 
int tile_dim[3]
Tile size for each dimension. 
Definition: hc.hpp:2231
 
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:3655
 
array(const extent< N > &ext, InputIter srcBegin)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator. 
Definition: hc.hpp:4298
 
void * get_hsa_agent() const 
Returns an opaque handle which points to the underlying HSA agent. 
Definition: hc.hpp:1051
 
uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4523
 
bool operator!=(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>. 
Definition: hc.hpp:1741
 
unsigned int __bitrev_b32(unsigned int src0)[[hc]] __asm("llvm.bitreverse.i32")
Reverse the bits. 
 
void global_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for global (but not tile-static) memory operations...
 
int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the desitionation. 
 
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4997
 
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)". 
Definition: hc.hpp:4326
 
void * get_hsa_am_finegrained_system_region() const 
Returns an opaque handle which points to the AM system region on the HSA agent. 
Definition: hc.hpp:1001
 
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view. 
Definition: hc.hpp:5367
 
const index< 1 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3581
 
Represents an extent subdivided into tiles. 
Definition: hc.hpp:2031
 
bool get_is_empty()
Returns true if the accelerator_view is currently empty. 
Definition: hc.hpp:458
 
tiled_extent(const tiled_extent< 2 > &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:2173
 
void synchronize_to(const accelerator_view &av) const 
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6056
 
array_view & operator=(const array_view< T, N > &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5919
 
void copy_to(const array_view< T, N > &dest) const 
Copies the contents of this array to the array_view given by "dest", as if by calling "copy(*this...
Definition: hc.hpp:4753
 
extent & operator*=(int value) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ value. 
Definition: hc.hpp:1824
 
const index< 2 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3679
 
extent() __CPU__ __HC__
Default constructor. 
Definition: hc.hpp:1594
 
array_view(const extent< N > &extent, Container &src)
Constructs an array_view which is bound to the data contained in the "src" container. 
Definition: hc.hpp:5180
 
Represents a unique position in N-dimensional space. 
Definition: hc.hpp:58
 
uint64_t atomic_fetch_min(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
 
void * get_hsa_kernarg_region()
Returns an opaque handle which points to the Kernarg region on the HSA agent. 
Definition: hc.hpp:526
 
extent & operator+=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent. 
Definition: hc.hpp:1755
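
Usage sketch (illustrative) of the component-wise extent operators:

    #include <hc.hpp>

    int main() {
        hc::extent<2> e(4, 8);
        e += hc::extent<2>(1, 1);        // component-wise add: (5, 9)
        e *= 2;                          // scale each component: (10, 18)
        return e.size() == 180 ? 0 : 1;  // 10 * 18 elements
    }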
 
accelerator & operator=(const accelerator &other)
Assigns an accelerator object to "this" accelerator object and returns a reference to "this" object...
Definition: hc.hpp:802
 
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext". 
Definition: hc.hpp:2294
 
array_view< T, N > section(const index< N > &origin, const extent< N > &ext) __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4930
 
tiled_extent(const tiled_extent< 1 > &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:2078
 
int __amdgcn_mbcnt_lo(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.lo")
Direct copy from indexed active work-item within a wavefront. 
 
void synchronize() const 
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6028
 
unsigned int get_version() const 
Returns a 32-bit unsigned integer representing the version number of this accelerator. 
Definition: hc.hpp:890
 
int __mul24(int x, int y)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication. 
Definition: hc.hpp:3184
 
bool get_is_peer(const accelerator &other) const 
Check if other is peer of this accelerator. 
Definition: hc.hpp:1061
 
T & operator()(int i0, int i1) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))". 
Definition: hc.hpp:4852
 
int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:3014
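
Usage sketch (illustrative; broadcast_lane0 is a hypothetical helper): broadcast lane 0's value across each wavefront.

    #include <hc.hpp>

    void broadcast_lane0(hc::array_view<float, 1> data) {
        hc::parallel_for_each(data.get_extent(), [=](hc::index<1> i) [[hc]] {
            // Every active lane reads the value held by lane 0.
            data[i] = hc::__shfl(data[i], 0);
        });
    }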
 
array_view< const T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:6272
 
void wait_with_tile_static_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3382
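
Usage sketch (illustrative; assumes the extent is a multiple of the tile size) combining tile_static storage with the tile-static fence:

    #include <hc.hpp>

    void reverse_within_tiles(hc::array_view<int, 1> av) {
        hc::parallel_for_each(av.get_extent().tile(64),
                              [=](hc::tiled_index<1> t) [[hc]] {
            tile_static int s[64];
            s[t.local[0]] = av[t.global];
            t.barrier.wait_with_tile_static_memory_fence();  // s[] fully written
            av[t.global] = s[63 - t.local[0]];
        });
    }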
 
Definition: kalmar_exception.h:51
 
const index< 2 > global
An index of rank 1, 2, or 3 that represents the global index within an extent. 
Definition: hc.hpp:3661
 
const index< 3 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3474
 
array_projection_helper< T, N >::result_type operator()(int i0) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const". 
Definition: hc.hpp:4898
 
unsigned int atomic_fetch_inc(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest. 
 
static bool set_default(const std::wstring &path)
Sets the default accelerator to the device path identified by the "path" argument. 
Definition: hc.hpp:770
 
uint64_t __clock_u64() __HC__
Get system timestamp. 
 
unsigned int __popcount_u32_b32(unsigned int input) __HC__
Count number of 1 bits in the input. 
Definition: hc.hpp:2389
 
void wait_with_all_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3347
 
uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits. 
 
array_view< const T, N > section(const index< N > &origin, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4938
 
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)". 
Definition: hc.hpp:6196
 
void * get_hsa_kernarg_region() const 
Returns an opaque handle which points to the Kernarg region on the HSA agent. 
Definition: hc.hpp:1012
 
void * get_group_segment_base_pointer() __HC__
Fetch the address of the beginning of group segment. 
 
unsigned int get_cu_count() const 
Return the compute unit count of the accelerator. 
Definition: hc.hpp:1087
 
array_view< T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)". 
Definition: hc.hpp:5624
 
unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
 
void * get_hsa_am_region() const 
Returns an opaque handle which points to the AM region on the HSA agent. 
Definition: hc.hpp:977
 
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4520
 
extent & operator+=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent. 
Definition: hc.hpp:1796
 
void copy_to(array< T, N > &dest) const 
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5317
 
T & operator()(const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4812
 
Namespace for internal classes of the Kalmar compiler / runtime. 
Definition: hc.hpp:42
 
const tile_barrier barrier
An object which represents a barrier within the current tile of threads. 
Definition: hc.hpp:3586
 
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)". 
Definition: hc.hpp:4652
 
void wait_with_global_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3364
 
float __amdgcn_ds_swizzle(float src, int pattern)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2896
 
const index< 3 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile. 
Definition: hc.hpp:3490
 
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view. 
Definition: hc.hpp:5988
 
array(int e0, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory. 
Definition: hc.hpp:4385
 
unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
 
int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
std::wstring get_device_path() const 
Returns a system-wide unique device instance path that matches the "Device Instance Path" property fo...
Definition: hc.hpp:877
 
unsigned int size() const __CPU__ __HC__
This member function returns the total linear size of this extent<N> (in units of elements)...
Definition: hc.hpp:1695
 
void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, const void *args, size_t argsize, hc::completion_future *cf=nullptr, const char *kernel_name=nullptr)
Dispatch a kernel into the accelerator_view. 
Definition: hc.hpp:597
 
array_view(const extent< N > &ext, const value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container. 
Definition: hc.hpp:5830
 
unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
array(int e0, int e1, int e2)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:4278
 
unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit. 
Definition: hc.hpp:2539
 
void * get_dynamic_group_segment_base_pointer() __HC__
Fetch the address of the beginning of dynamic group segment. 
 
void discard_data() const 
Indicates to the runtime that it may discard the current logical contents of this array_view...
Definition: hc.hpp:5495
 
queuing_mode get_queuing_mode() const 
Returns the queuing mode that this accelerator_view was created with. 
Definition: hc.hpp:152
 
completion_future synchronize_async() const 
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:6040
 
tiled_extent() __CPU__ __HC__
Default constructor. 
Definition: hc.hpp:2237
 
const index< 1 > global
An index of rank 1, 2, or 3 that represents the global index within an extent. 
Definition: hc.hpp:3563
 
uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__
Do bit field selection. 
 
double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
array_view< T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)". 
Definition: hc.hpp:5615
 
accelerator_view & operator=(const accelerator_view &other)
Assigns an accelerator_view object to "this" accelerator_view object and returns a reference to "this...
Definition: hc.hpp:141
 
array_view(const array_view &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:5891
 
int value_type
The element type of extent<N>. 
Definition: hc.hpp:1588
 
int tile_dim[2]
Tile size for each dimension. 
Definition: hc.hpp:2135
 
array(int e0)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:4274
 
void refresh() const 
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5997
 
uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest , replace it with the value given in val and return the old ...
 
bool has_cpu_accessible_am()
Return true if the accelerator's memory can be mapped into the CPU's address space, and the CPU is allowed to access the memory directly with CPU memory operations. 
Definition: hc.hpp:1106
 
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)". 
Definition: hc.hpp:4957
 
accelerator get_accelerator() const 
Returns the accelerator that this accelerator_view has been created on. 
Definition: hc.hpp:1461
 
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array. 
Definition: hc.hpp:4663
 
extent & operator%=(int value) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ value. 
Definition: hc.hpp:1832
 
tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__
Construct a tiled_extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2152
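
Usage sketch (illustrative):

    #include <hc.hpp>

    hc::tiled_extent<2> make_domain() {
        // A 256x256 domain subdivided into 16x16 tiles; equivalent to
        // hc::extent<2>(256, 256).tile(16, 16).
        return hc::tiled_extent<2>(256, 256, 16, 16);
    }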
 
tiled_extent() __CPU__ __HC__
Default constructor. 
Definition: hc.hpp:2009
 
Definition: kalmar_exception.h:22
 
uint64_t atomic_fetch_max(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
 
int __amdgcn_mbcnt_hi(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.hi")
Direct copy from indexed active work-item within a wavefront. 
 
accelerator_view(const accelerator_view &other)
Copy-constructs an accelerator_view object. 
Definition: hc.hpp:129
 
extent & operator-=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent. 
Definition: hc.hpp:1800
 
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:6230
 
array_view< const T, K > view_as(const extent< K > &viewExtent) const __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function. 
Definition: hc.hpp:5087
 
accelerator_view get_source_accelerator_view() const 
Access the accelerator_view where the data source of the array_view is located. 
Definition: hc.hpp:5289
 
void wait(hcWaitMode mode=hcWaitModeBlocked) const 
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1235
 
array(const array &other)
Copy constructor. 
Definition: hc.hpp:4242
 
index< N > operator-(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:498
 
unsigned int get_static_group_segment_size() __HC__
Fetch the size of static group segment. 
 
array_view< ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:5668
 
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes. 
Definition: hc.hpp:2207
 
uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
hcAgentProfile get_profile() const 
Returns the profile of the accelerator. 
Definition: hc.hpp:1029
 
tiled_extent() __CPU__ __HC__
Default constructor. 
Definition: hc.hpp:2050
 
void copy_to(const array_view< T, N > &dest) const 
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5957
 
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles. 
Definition: hc.hpp:3441
 
completion_future(const completion_future &other)
Copy constructor. 
Definition: hc.hpp:1147
 
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:6220
 
accelerator_view get_default_view() const 
Returns the default accelerator_view associated with the accelerator. 
Definition: hc.hpp:813
 
extent(const int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1636
 
array_view(array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5160
 
accelerator(const accelerator &other)
Copy constructs an accelerator object. 
Definition: hc.hpp:740
 
array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)". 
Definition: hc.hpp:4570
 
unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4532
 
const tile_barrier barrier
An object which represents a barrier within the current tile of threads. 
Definition: hc.hpp:3485
 
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4829
 
unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do bit field selection. 
 
array(const extent< N > &ext, accelerator_view av, void *accelerator_pointer, access_type cpu_access_type=access_type_auto)
Constructs an array instance based on the given pointer on the device memory. 
Definition: hc.hpp:4405
 
void copy_to(array< T, N > &dest) const 
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5948
 
bool contains(const index< N > &idx) const __CPU__ __HC__
Tests whether the index "idx" is properly contained within this extent (with an assumed origin of zer...
Definition: hc.hpp:1686
 
uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__
Return a bit mask shows which active work-items in the wavefront have a non-zero input. 
 
const tile_barrier barrier
An object which represents a barrier within the current tile of threads. 
Definition: hc.hpp:3684
 
unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles. 
Definition: hc.hpp:3542
 
void copy(const void *src, void *dst, size_t size_bytes)
Copies size_bytes bytes from src to dst. 
Definition: hc.hpp:343
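
Usage sketch (illustrative; blocking_copy is a hypothetical helper):

    #include <hc.hpp>
    #include <cstddef>

    void blocking_copy(hc::accelerator_view& av,
                       const float* src, float* dst, std::size_t n) {
        av.copy(src, dst, n * sizeof(float));  // synchronous; size is in bytes
    }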
 
void copy(const array_view< const T, 1 > &src, const array_view< T, 1 > &dest)
The contents of "src" are copied into "dest". 
Definition: hc.hpp:6694
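
Usage sketch (illustrative) of the free copy function on rank-1 array_views:

    #include <hc.hpp>
    #include <vector>

    int main() {
        std::vector<int> a(16, 1), b(16, 0);
        hc::array_view<const int, 1> src(16, a);
        hc::array_view<int, 1> dst(16, b);
        hc::copy(src, dst);  // b's backing store now holds a's contents
        return 0;
    }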
 
extent operator-(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent. 
Definition: hc.hpp:1791
 
extent operator--(int) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ 1. 
Definition: hc.hpp:1860
 
const index< 1 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile. 
Definition: hc.hpp:3591
 
array_view< T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:5639
 
bool set_default_cpu_access_type(access_type type)
Sets the default_cpu_access_type for this accelerator. 
Definition: hc.hpp:867
 
const T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view. 
Definition: hc.hpp:5974
 
completion_future()
Default constructor. 
Definition: hc.hpp:1138
 
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4838
 
std::wstring get_description() const 
Returns a short textual description of the accelerator device. 
Definition: hc.hpp:882
 
Definition: kalmar_exception.h:42
 
T value_type
The element type of this array. 
Definition: hc.hpp:5145
 
T & operator[](const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4803
 
unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect. 
 
unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits. 
 
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5005
 
array_view(int e0, const value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5866
 
void * get_native_handle() const 
Get the native handle for the asynchronous operation encapsulated in this completion_future object...
Definition: hc.hpp:1302
 
unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__
Takes four floating-point numbers, converts them to unsigned integer values, and packs them into a pack...
 
Definition: kalmar_math.h:691
 
array_view(const array_view< nc_T, N > &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:5881
 
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5848
 
array(const extent< N > &ext)
Constructs a new array with the supplied extent, located on the default view of the default accelerat...
Definition: hc.hpp:4264
 
int __mad24(int x, int y, int z)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication and then ad...
Definition: hc.hpp:3210
 
float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__
Unpacks a single element from a packed u8x4 value and converts it to an f32. 
 
array(int e0, int e1, int e2, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory. 
Definition: hc.hpp:4389
 
float __amdgcn_ds_permute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2880
 
unsigned int __popcount_u32_b64(unsigned long long int input) __HC__
Count number of 1 bits in the input. 
Definition: hc.hpp:2399
 
array(int e0, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)". 
Definition: hc.hpp:4317
 
array(const array_view< const T, N > &src, accelerator_view av, accelerator_view associated_av)
Constructs a staging array initialized with the array_view given by "src", which acts as a staging ar...
Definition: hc.hpp:4621
 
Represents an extent subdivided into tiles. 
Definition: hc.hpp:2122
 
extent & operator+=(int value) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ value. 
Definition: hc.hpp:1816
 
void then(const functor &func)
This method enables specification of a completion callback func which is executed upon completion of ...
Definition: hc.hpp:1279
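
Usage sketch (illustrative; scale_async is hypothetical) chaining a callback onto the completion_future returned by parallel_for_each:

    #include <hc.hpp>

    void scale_async(hc::array_view<float, 1> data) {
        hc::completion_future cf = hc::parallel_for_each(
            data.get_extent(),
            [=](hc::index<1> i) [[hc]] { data[i] *= 2.0f; });
        cf.then([] { /* runs once the kernel completes */ });
        cf.wait();  // block until the kernel has completed
    }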
 
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext". 
Definition: hc.hpp:2283
 
array(int e0, int e1, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)". 
Definition: hc.hpp:4425
 
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4526
 
int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
Represents a logical (isolated) accelerator view of a compute accelerator. 
Definition: hc.hpp:120
 
int tile_dim[1]
Tile size for each dimension. 
Definition: hc.hpp:2044
 
Represents an N-dimensional region of memory (with type T) located on an accelerator. 
Definition: hc.hpp:61
 
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const". 
Definition: hc.hpp:4869
 
tiled_extent(const extent< 1 > &ext, int t0, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext". 
Definition: hc.hpp:2096
 
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchronous operation. 
Definition: hc.hpp:1344
 
access_type get_default_cpu_access_type() const 
Get the default cpu access_type for buffers created on this accelerator. 
Definition: hc.hpp:946
 
unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)". 
Definition: hc.hpp:4646
 
uint64_t get_system_ticks()
Get the current tick count for the GPU platform. 
Definition: hc.hpp:93
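
Usage sketch (illustrative; assumes the free function hc::get_tick_frequency() that accompanies hc::get_system_ticks()):

    #include <hc.hpp>
    #include <cstdint>

    double elapsed_seconds(std::uint64_t t0, std::uint64_t t1) {
        // Convert a tick delta to seconds using the platform tick frequency.
        return double(t1 - t0) / double(hc::get_tick_frequency());
    }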
 
bool operator==(const accelerator_view &other) const 
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:419
 
array_view< T, K > view_as(const extent< K > &viewExtent) __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function. 
Definition: hc.hpp:5078
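
Usage sketch (illustrative) reshaping a rank-1 view into rank 2 with view_as:

    #include <hc.hpp>
    #include <vector>

    int main() {
        std::vector<int> v(12);
        hc::array_view<int, 1> flat(12, v);
        hc::array_view<int, 2> m = flat.view_as(hc::extent<2>(3, 4));  // 12 -> 3x4
        return 0;
    }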
 
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view. 
Definition: hc.hpp:5897
 
Represents an extent subdivided into tiles. 
Definition: hc.hpp:59
 
array_view< T, N > section(const index< N > &idx) __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)". 
Definition: hc.hpp:4949
 
uint64_t get_end_tick()
Get the tick number when the underlying asynchronous operation ends. 
Definition: hc.hpp:1330
 
array(int e0, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)". 
Definition: hc.hpp:4320
 
void * get_hsa_am_region()
Returns an opaque handle which points to the AM region on the HSA agent. 
Definition: hc.hpp:490
 
execute_order get_execute_order() const 
Returns the execution order of this accelerator_view. 
Definition: hc.hpp:157
 
extent(int e0) __CPU__ __HC__
Constructs an extent<N> with the coordinate value provided by e0. 
Definition: hc.hpp:1616
 
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)". 
Definition: hc.hpp:4655
 
completion_future(completion_future &&other)
Move constructor. 
Definition: hc.hpp:1159
 
bool is_hsa_accelerator() const 
Returns if the accelerator is based on HSA. 
Definition: hc.hpp:1019
 
unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits. 
 
bool get_supports_limited_double_precision() const 
Returns a boolean value indicating whether the accelerator has limited double precision support (excl...
Definition: hc.hpp:922
 
const index< 1 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3575
 
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator. 
Definition: hc.hpp:4301
 
uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
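
Usage sketch (illustrative; uses the unsigned int overload of atomic_fetch_add, and count_positive is hypothetical):

    #include <hc.hpp>

    void count_positive(hc::array_view<const int, 1> in,
                        hc::array_view<unsigned int, 1> counter) {
        hc::parallel_for_each(in.get_extent(), [=](hc::index<1> i) [[hc]] {
            if (in[i] > 0)
                hc::atomic_fetch_add(&counter[0], 1u);  // contended atomic add
        });
    }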
 
extent(_Tp...__t) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the component arguments __t. 
Definition: hc.hpp:1620
 
array_view(int e0)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:5258
 
int __unpacklo_s8x4(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__
Construct a tiled_extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2250
 
accelerator_view create_view(execute_order order=execute_in_order, queuing_mode mode=queuing_mode_automatic)
Creates and returns a new accelerator view on the accelerator with the supplied queuing mode...
Definition: hc.hpp:823
 
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4993
 
void refresh() const 
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5376
 
int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
array_projection_helper< T, N >::const_result_type operator()(int i0) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const". 
Definition: hc.hpp:4906
 
const index< 1 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3569
 
int get_pending_async_ops()
Returns the number of pending asynchronous operations on this accelerator view. 
Definition: hc.hpp:447
 
int get_use_count() const 
Definition: hc.hpp:1381
 
bool get_has_display() const 
This property indicates that the accelerator may be shared by (and thus have interference from) the o...
Definition: hc.hpp:900
 
unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits. 
Definition: hc.hpp:2409
 
tiled_extent(int e0, int t0) __CPU__ __HC__
Construct a tiled_extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2059
 
index< N > operator+(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:492
 
extent(int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1647
 
std::future_status wait_for(const std::chrono::duration< _Rep, _Period > &_Rel_time) const 
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1248
 
unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
array_view< const T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:6225
 
void wait(hcWaitMode waitMode=hcWaitModeBlocked)
Performs a blocking wait for completion of all commands submitted to the accelerator view prior to ca...
Definition: hc.hpp:208
 
unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
accelerator_view get_accelerator_view() const 
This property returns the accelerator_view representing the location where this array has been alloca...
Definition: hc.hpp:4669
 
unsigned int __atomic_wrapinc(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations: 
 
void synchronize_to(const accelerator_view &av) const 
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5464
 
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4596
 
Represents a unique position in N-dimensional space. 
Definition: kalmar_index.h:226
 
array(const extent< N > &ext, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator vi...
Definition: hc.hpp:4549
 
uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits. 
 
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)". 
Definition: hc.hpp:4972
 
int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator. 
Definition: hc.hpp:953
 
bool operator!=(const accelerator_view &other) const 
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:431
 
array_view(int e0, value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5244
 
extent & operator/=(const extent &__r) __CPU__ __HC__
Divides this extent element-wise by an object of type extent<N> to form a new extent. 
Definition: hc.hpp:1767
 
const index< 2 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3673
 
void copy_to(array &dest) const 
Copies the contents of this array to the array given by "dest", as if by calling "copy(*this, dest)". 
Definition: hc.hpp:4735
 
extent & operator/=(int value) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ value. 
Definition: hc.hpp:1828
 
tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__
Construct a tiled_extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2164
 
array & operator=(const array_view< T, N > &src)
Assigns the contents of the array_view "src", as if by calling "copy(src, *this)". 
Definition: hc.hpp:4722
 
int __any(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2781
 
tiled_extent(const extent< 2 > &ext, int t0, int t1) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext". 
Definition: hc.hpp:2182
 
int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary byte boundary. 
 
extent & operator%=(const extent &__r) __CPU__ __HC__
Computes the element-wise remainder of this extent with an object of type extent<N> to form a new extent. 
Definition: hc.hpp:1771
 
unsigned int __lastbit_u32_s32(int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit. 
Definition: hc.hpp:2543
 
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))". 
Definition: hc.hpp:6132
 
unsigned int __firstbit_u32_s32(int input) __HC__
Count leading zero bits in the input. 
Definition: hc.hpp:2502
 
bool is_ready()
Returns true if the asynchronous operation has completed. 
Definition: hc.hpp:1357
 
access_type get_cpu_access_type() const 
This property returns the CPU "access_type" allowed for this array. 
Definition: hc.hpp:4680
 
T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a reference to the element of this array_view that is at the location in N-dimensional space ...
Definition: hc.hpp:5517
 
tiled_extent(const tiled_extent< 3 > &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:2273
 
uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)". 
Definition: hc.hpp:4332
 
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles. 
Definition: hc.hpp:3640
 
uint64_t __cycle_u64() __HC__
Get hardware cycle count. 
 
unsigned int get_group_segment_size() __HC__
Fetch the size of group segment. 
 
float __amdgcn_wave_rr1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2963
 
uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect. 
 
array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)". 
Definition: hc.hpp:4427
 
uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
extent & operator--() __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ 1. 
Definition: hc.hpp:1856
 
completion_future & operator=(completion_future &&_Other)
Move assignment. 
Definition: hc.hpp:1188
 
array(int e0, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)". 
Definition: hc.hpp:4568
 
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6088
 
unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
This function is mostly the same as sad except the sum of absolute differences is added to the most s...
 
tiled_extent(int e0, int t0, int size) __CPU__ __HC__
Construct a tiled_extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2069
 
float __amdgcn_ds_bpermute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2865
 
bool set_cu_mask(const std::vector< bool > &cu_mask)
Set a CU affinity to specific command queues. 
Definition: hc.hpp:618
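
Usage sketch (illustrative; only meaningful on HSA-backed queues):

    #include <hc.hpp>
    #include <vector>

    bool restrict_to_first_cus(hc::accelerator_view& av, unsigned total_cus) {
        std::vector<bool> mask(total_cus, false);
        for (unsigned i = 0; i < 8 && i < total_cus; ++i)
            mask[i] = true;               // enable only the first 8 CUs
        return av.set_cu_mask(mask);      // fails on non-HSA accelerator_views
    }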
 
T * data() const __CPU__ __HC__
Returns a pointer to the raw data underlying this array. 
Definition: hc.hpp:4760
 
T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view. 
Definition: hc.hpp:5352
 
int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits. 
 
uint64_t __ballot(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return an integer whose Nth bit is ...
Definition: hc.hpp:2800
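
Usage sketch (illustrative; vote is a hypothetical helper) of the wavefront vote functions:

    #include <hc.hpp>
    #include <cstdint>

    void vote(hc::array_view<const int, 1> in,
              hc::array_view<std::uint64_t, 1> out) {
        hc::parallel_for_each(in.get_extent(), [=](hc::index<1> i) [[hc]] {
            int pred = (in[i] != 0);
            std::uint64_t lanes = hc::__ballot(pred);  // one bit per active lane
            if (i[0] == 0)
                out[0] = lanes;  // record the first wavefront's mask
        });
    }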
 
int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)". 
Definition: hc.hpp:4640
 
extent & operator++() __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ 1. 
Definition: hc.hpp:1847
 
index< N > operator*(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:547
 
array_view< T, 2 > section(int i0, int i1, int e0, int e1) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5001
 
index< N > operator%(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:571
 
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view. 
Definition: hc.hpp:5278
 
array(int e0, int e1, int e2, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)". 
Definition: hc.hpp:4329
 
int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
unsigned int __activelanecount_u32_b1(unsigned int input) __HC__
Count the number of active work-items in the current wavefront that have a non-zero input...
Definition: hc.hpp:2768
 
array(const array_view< const T, N > &src)
Constructs a new array, located on the default view of the default accelerator, initialized with the ...
Definition: hc.hpp:4348
 
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4593
 
extent & operator-=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent. 
Definition: hc.hpp:1759
 
uint64_t __bitrev_b64(uint64_t src0)[[hc]] __asm("llvm.bitreverse.i64")
Reverse the bits. 
 
bool get_supports_cpu_shared_memory() const 
Returns a boolean value indicating whether the accelerator supports memory accessible both by the acc...
Definition: hc.hpp:941
 
uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
float __amdgcn_wave_sr1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2923
 
int __all(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2790
 
accelerator(const std::wstring &path)
Constructs a new accelerator object that represents the physical device named by the "path" argument...
Definition: hc.hpp:730
 
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment. 
Definition: hc.hpp:2302
 
bool get_is_emulated() const 
Returns a boolean value indicating whether the accelerator is emulated. 
Definition: hc.hpp:935
 
bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__
These functions attempt to perform these three steps atomically: 
 
completion_future & operator=(const completion_future &_Other)
Copy assignment. 
Definition: hc.hpp:1170
 
int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
extent & operator=(const extent &other) __CPU__ __HC__
Assigns the component values of "other" to this extent<N> object. 
Definition: hc.hpp:1657
 
int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
const T & operator()(int i0) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))". 
Definition: hc.hpp:6123
 
array_view< T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:5602
 
array(int e0, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)". 
Definition: hc.hpp:4423
 
int __unpackhi_s16x2(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4529
 
Heterogeneous C++ (HC) namespace. 
Definition: grid_launch.h:10
 
int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item based on bitwise XOR of caller work-item ID within a wavefront...
Definition: hc.hpp:3142
 
int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
void * get_hsa_am_system_region() const 
Returns an opaque handle which points to the AM system region on the HSA agent. 
Definition: hc.hpp:989
 
extent operator++(int) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ 1. 
Definition: hc.hpp:1851
 
static std::vector< accelerator > get_all()
Returns a std::vector of accelerator objects (in no specific order) representing all accelerators tha...
Definition: hc.hpp:749
 
bool get_supports_double_precision() const 
Returns a Boolean value indicating whether this accelerator supports double-precision (double) comput...
Definition: hc.hpp:914
 
unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
std::vector< accelerator_view > get_all_views()
Returns a vector of all accelerator_view associated with this accelerator. 
Definition: hc.hpp:960
 
array_view< T, 1 > section(int i0, int e0) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4989
 
void copy_to(const array_view &dest) const 
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5335
 
array_view< T, N > section(const extent< N > &ext) __CPU__ __HC__
Equivalent to "section(index<N>(), ext)". 
Definition: hc.hpp:4968
 
T & operator()(int i0, int i1, int i2) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))". 
Definition: hc.hpp:4855
 
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4460
 
array(int e0, int e1, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)". 
Definition: hc.hpp:4323
 
bool get_is_debug() const 
Returns a boolean value indicating whether the accelerator supports debugging. 
Definition: hc.hpp:929
 
int __unpackhi_s8x4(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the desitionation. 
 
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))". 
Definition: hc.hpp:6128
 
uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination. 
 
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5928
 
void tile_static_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for tile-static (but not global) memory operations...
 
extent & operator-=(int value) __CPU__ __HC__
For a given operator @, produces the same effect as (*this) = (*this) @ value. 
Definition: hc.hpp:1820
 
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator view. 
Definition: hc.hpp:437
 
unsigned int __lastbit_u32_s64(unsigned long long input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit. 
Definition: hc.hpp:2547
 
uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination. 
 
#define __HSA_WAVEFRONT_SIZE__
Fetch the size of a wavefront. 
Definition: hc.hpp:2373
 
uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
 
int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
tile_barrier(const tile_barrier &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:3317
 
int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits. 
 
void * get_hsa_queue()
Returns an opaque handle which points to the underlying HSA queue. 
Definition: hc.hpp:468
 
unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment. 
Definition: hc.hpp:2104
 
array(const extent< N > &ext, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory. 
Definition: hc.hpp:4392
 
bool valid() const 
This method is functionally identical to std::shared_future<void>::valid. 
Definition: hc.hpp:1213
 
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array. 
Definition: hc.hpp:4775
 
array_view(const array_view &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:5272
 
array_view< const ElementType, 1 > reinterpret_as() const __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with a (unsafe) reinterpretation of the element type. 
Definition: hc.hpp:5051
 
unsigned int atomic_fetch_dec(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest. 
 
array(array &&other)
Move constructor. 
Definition: hc.hpp:4253
 
uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
 
T value_type
The element type of this array. 
Definition: hc.hpp:4227
 
array_view< T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:5693
 
unsigned int __activelaneid_u32() __HC__
Get the count of the number of earlier (in flattened work-item order) active work-items within the sa...
 
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5009
 
unsigned int __firstbit_u32_s64(long long int input) __HC__
Count leading zero bits in the input. 
Definition: hc.hpp:2520
 
array_view< ElementType, 1 > reinterpret_as() __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with a (unsafe) reinterpretation of the element type. 
Definition: hc.hpp:5038
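
Usage sketch (illustrative) viewing the bytes of a float view as unsigned int:

    #include <hc.hpp>
    #include <vector>

    int main() {
        std::vector<float> v(8, 1.0f);
        hc::array_view<float, 1> fv(8, v);
        // Same underlying bytes, reinterpreted element type (sizes must match).
        hc::array_view<unsigned int, 1> bits = fv.reinterpret_as<unsigned int>();
        return 0;
    }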
 
const index< 2 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3667
 
int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl)[[hc]]
Move DPP intrinsic. 
 
unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary bit boundary. 
 
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const". 
Definition: hc.hpp:4872
 
int __unpack_s32_s32x2(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0. 
 
void synchronize() const 
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5412
 
const index< 3 > global
An index of rank 1, 2, or 3 that represents the global index within an extent. 
Definition: hc.hpp:3462
 
const index< 3 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3480
 
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes. 
Definition: hc.hpp:2111
 
array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4517
 
void all_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for both global and tile-static memory operations...
 
bool is_hsa_accelerator()
Returns true if the accelerator_view is based on HSA. 
Definition: hc.hpp:533
 
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))". 
Definition: hc.hpp:5644
 
unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
 
bool operator==(const accelerator &other) const 
Compares "this" accelerator with the passed accelerator object to determine if they represent the sam...
Definition: hc.hpp:837
 
std::future_status wait_until(const std::chrono::time_point< _Clock, _Duration > &_Abs_time) const 
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1253
 
Definition: kalmar_math.h:297
 
unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__
Count leading zero bits in the input. 
Definition: hc.hpp:2489
 
Represents a physical accelerated computing device. 
Definition: hc.hpp:700
 
Definition: kalmar_runtime.h:14
 
int get_seqnum() const 
Return the unique integer sequence-number for the accelerator. 
Definition: hc.hpp:1095
 
unsigned int __atomic_wrapdec(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations: 
 
int __lane_id(void)[[hc]]
Direct copy from indexed active work-item within a wavefront. 
Definition: hc.hpp:2845
 
array_view(const array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5796
 
float atomic_fetch_sub(float *x, float y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
 
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor. 
Definition: hc.hpp:3557
 
array_view(const extent< N > &extent, const Container &src)
Constructs an array_view which is bound to the data contained in the "src" container. 
Definition: hc.hpp:5816
 
float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with higher ID relative to caller within a wavefront. 
Definition: hc.hpp:3111
 
extent operator+(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent. 
Definition: hc.hpp:1786
 
static accelerator_view get_auto_selection_view()
Returns an accelerator_view which when passed as the first argument to a parallel_for_each call cause...
Definition: hc.hpp:789