HCC
HCC is a single-source C/C++ compiler for heterogeneous computing, optimized for HSA (http://www.hsafoundation.com/).
hc.hpp
1 //===----------------------------------------------------------------------===//
2 //
3 // This file is distributed under the University of Illinois Open Source
4 // License. See LICENSE.TXT for details.
5 //
6 //===----------------------------------------------------------------------===//
7 
13 #pragma once
14 
15 #include "hc_defines.h"
16 #include "kalmar_exception.h"
17 #include "kalmar_index.h"
18 #include "kalmar_runtime.h"
19 #include "kalmar_serialize.h"
20 #include "kalmar_launch.h"
21 #include "kalmar_buffer.h"
22 #include "kalmar_math.h"
23 
24 #include "hsa_atomic.h"
25 #include "kalmar_cpu_launch.h"
26 #include "hcc_features.hpp"
27 
28 #ifndef __HC__
29 # define __HC__ [[hc]]
30 #endif
31 
32 #ifndef __CPU__
33 # define __CPU__ [[cpu]]
34 #endif
35 
36 typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
37 
42 namespace Kalmar {
43  class HSAQueue;
44 };
45 
46 namespace hc {
47 
48 class AmPointerInfo;
49 
50 using namespace Kalmar::enums;
51 using namespace Kalmar::CLAMP;
52 
53 
54 // forward declaration
55 class accelerator;
56 class accelerator_view;
57 class completion_future;
58 template <int N> class extent;
59 template <int N> class tiled_extent;
60 template <typename T, int N> class array_view;
61 template <typename T, int N> class array;
62 
63 
64 
65 // namespace alias
66 // namespace hc::fast_math is an alias of namespace Kalmar::fast_math
67 namespace fast_math = Kalmar::fast_math;
68 
69 // namespace hc::precise_math is an alias of namespace Kalmar::precise_math
70 namespace precise_math = Kalmar::precise_math;
71 
72 // type alias
73 
77 template <int N>
78 using index = Kalmar::index<N>;
79 
83 
84 // ------------------------------------------------------------------------
85 // global functions
86 // ------------------------------------------------------------------------
87 
93 inline uint64_t get_system_ticks() {
94  return Kalmar::getContext()->getSystemTicks();
95 }
96 
103 inline uint64_t get_tick_frequency() {
104  return Kalmar::getContext()->getSystemTickFrequency();
105 }
106 
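The two tick functions above pair naturally: divide a tick delta by the tick frequency to get seconds. A minimal host-side sketch (not part of hc.hpp; do_work() is a hypothetical placeholder):

    #include <hc.hpp>
    #include <iostream>

    void report_elapsed() {
      uint64_t t0 = hc::get_system_ticks();
      // do_work();   // hypothetical workload being timed
      uint64_t t1 = hc::get_system_ticks();
      double seconds = double(t1 - t0) / double(hc::get_tick_frequency());
      std::cout << "elapsed: " << seconds << " s\n";
    }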
107 #define GET_SYMBOL_ADDRESS(acc, symbol) \
108  acc.get_symbol_address( #symbol );
109 
110 
111 // ------------------------------------------------------------------------
112 // accelerator_view
113 // ------------------------------------------------------------------------
114 
120 class accelerator_view {
121 public:
129  accelerator_view(const accelerator_view& other) :
130  pQueue(other.pQueue) {}
131 
141  accelerator_view& operator=(const accelerator_view& other) {
142  pQueue = other.pQueue;
143  return *this;
144  }
145 
152  queuing_mode get_queuing_mode() const { return pQueue->get_mode(); }
153 
157  execute_order get_execute_order() const { return pQueue->get_execute_order(); }
158 
170  // FIXME: dummy implementation now
171  bool get_is_auto_selection() { return false; }
172 
182  unsigned int get_version() const;
183 
187  accelerator get_accelerator() const;
188 
196  // FIXME: dummy implementation now
197  bool get_is_debug() const { return 0; }
198 
208  void wait(hcWaitMode waitMode = hcWaitModeBlocked) {
209  pQueue->wait(waitMode);
210  Kalmar::getContext()->flushPrintfBuffer();
211  }
212 
241  void flush() { pQueue->flush(); }
242 
262  completion_future create_marker(memory_scope fence_scope=system_scope) const;
263 
289  completion_future create_blocking_marker(completion_future& dependent_future, memory_scope fence_scope=system_scope) const;
290 
313  completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope fence_scope=system_scope) const;
314 
315 
333  template<typename InputIterator>
334  completion_future create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const;
335 
343  void copy(const void *src, void *dst, size_t size_bytes) {
344  pQueue->copy(src, dst, size_bytes);
345  }
346 
347 
360  void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc, bool forceUnpinnedCopy);
361 
362 
363  // TODO - this form is deprecated, provided for use with older HIP runtimes.
364  void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceUnpinnedCopy) ;
365 
380  completion_future copy_async(const void *src, void *dst, size_t size_bytes);
381 
382 
407  completion_future copy_async_ext(const void *src, void *dst, size_t size_bytes,
408  hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
409  const hc::accelerator *copyAcc);
410 
419  bool operator==(const accelerator_view& other) const {
420  return pQueue == other.pQueue;
421  }
422 
431  bool operator!=(const accelerator_view& other) const { return !(*this == other); }
432 
437  int get_max_tile_static_size() {
438  return pQueue.get()->getDev()->GetMaxTileStaticSize();
439  }
440 
447  int get_pending_async_ops() {
448  return pQueue->getPendingAsyncOps();
449  }
450 
458  bool get_is_empty() {
459  return pQueue->isEmpty();
460  }
461 
468  void* get_hsa_queue() {
469  return pQueue->getHSAQueue();
470  }
471 
478  void* get_hsa_agent() {
479  return pQueue->getHSAAgent();
480  }
481 
490  void* get_hsa_am_region() {
491  return pQueue->getHSAAMRegion();
492  }
493 
494 
503  void* get_hsa_am_system_region() {
504  return pQueue->getHSAAMHostRegion();
505  }
506 
515  void* get_hsa_am_finegrained_system_region() {
516  return pQueue->getHSACoherentAMHostRegion();
517  }
518 
526  void* get_hsa_kernarg_region() {
527  return pQueue->getHSAKernargRegion();
528  }
529 
533  bool is_hsa_accelerator() {
534  return pQueue->hasHSAInterOp();
535  }
536 
597  void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
598  const void * args, size_t argsize,
599  hc::completion_future *cf=nullptr, const char *kernel_name = nullptr)
600  {
601  pQueue->dispatch_hsa_kernel(aql, args, argsize, cf, kernel_name);
602  }
603 
618  bool set_cu_mask(const std::vector<bool>& cu_mask) {
619  // If this is an HSA-based accelerator_view, set the CU mask; otherwise return false.
620  if(is_hsa_accelerator()) {
621  return pQueue->set_cu_mask(cu_mask);
622  }
623  return false;
624  }
625 
626 private:
627  accelerator_view(std::shared_ptr<Kalmar::KalmarQueue> pQueue) : pQueue(pQueue) {}
628  std::shared_ptr<Kalmar::KalmarQueue> pQueue;
629 
630  friend class accelerator;
631  template <typename Q, int K> friend class array;
632  template <typename Q, int K> friend class array_view;
633 
634  template<typename Kernel> friend
635  void* Kalmar::mcw_cxxamp_get_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, const Kernel&);
636  template<typename Kernel, int dim_ext> friend
637  void Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
638  template<typename Kernel, int dim_ext> friend
639  std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&, void*, size_t);
640  template<typename Kernel, int dim_ext> friend
641  void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
642  template<typename Kernel, int dim_ext> friend
643  std::shared_ptr<Kalmar::KalmarAsyncOp> Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
644 
645 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
646  template <typename Kernel, int N> friend
647  completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>&, Kernel const&, extent<N> const&);
648 #endif
649 
650  // non-tiled parallel_for_each
651  // generic version
652  template <int N, typename Kernel> friend
653  completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
654 
655  // 1D specialization
656  template <typename Kernel> friend
657  completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
658 
659  // 2D specialization
660  template <typename Kernel> friend
661  completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
662 
663  // 3D specialization
664  template <typename Kernel> friend
665  completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
666 
667  // tiled parallel_for_each, 3D version
668  template <typename Kernel> friend
669  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
670 
671  // tiled parallel_for_each, 2D version
672  template <typename Kernel> friend
673  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
674 
675  // tiled parallel_for_each, 1D version
676  template <typename Kernel> friend
677  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
678 
679 
680 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
681 public:
682 #endif
683  __attribute__((annotate("user_deserialize")))
684  accelerator_view() __CPU__ __HC__ {
685 #if __KALMAR_ACCELERATOR__ != 1
686  throw runtime_exception("errorMsg_throw", 0);
687 #endif
688  }
689 };
690 
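A minimal usage sketch for the class above (not part of hc.hpp): obtain the default view of the default accelerator, submit work on it, then fence that work with a marker or a blocking wait. It assumes a working default accelerator.

    #include <hc.hpp>

    void view_demo() {
      hc::accelerator acc;                               // default accelerator
      hc::accelerator_view av = acc.get_default_view();  // its default queue
      // ... enqueue kernels / copies on av ...
      hc::completion_future marker = av.create_marker(); // fences prior work
      marker.wait();                                     // or simply av.wait();
    }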
691 // ------------------------------------------------------------------------
692 // accelerator
693 // ------------------------------------------------------------------------
694 
700 class accelerator
701 {
702 public:
713  accelerator() : accelerator(L"default") {}
714 
730  explicit accelerator(const std::wstring& path)
731  : pDev(Kalmar::getContext()->getDevice(path)) {}
732 
740  accelerator(const accelerator& other) : pDev(other.pDev) {}
741 
749  static std::vector<accelerator> get_all() {
750  auto Devices = Kalmar::getContext()->getDevices();
751  std::vector<accelerator> ret;
752  for(auto&& i : Devices)
753  ret.push_back(i);
754  return ret;
755  }
756 
770  static bool set_default(const std::wstring& path) {
771  return Kalmar::getContext()->set_default(path);
772  }
773 
789  static accelerator_view get_auto_selection_view() {
790  return Kalmar::getContext()->auto_select();
791  }
792 
802  accelerator& operator=(const accelerator& other) {
803  pDev = other.pDev;
804  return *this;
805  }
806 
813  accelerator_view get_default_view() const { return pDev->get_default_queue(); }
814 
823  accelerator_view create_view(execute_order order = execute_in_order, queuing_mode mode = queuing_mode_automatic) {
824  auto pQueue = pDev->createQueue(order);
825  pQueue->set_mode(mode);
826  return pQueue;
827  }
828 
837  bool operator==(const accelerator& other) const { return pDev == other.pDev; }
838 
847  bool operator!=(const accelerator& other) const { return !(*this == other); }
848 
867  bool set_default_cpu_access_type(access_type type) {
868  pDev->set_access(type);
869  return true;
870  }
871 
877  std::wstring get_device_path() const { return pDev->get_path(); }
878 
882  std::wstring get_description() const { return pDev->get_description(); }
883 
890  unsigned int get_version() const { return pDev->get_version(); }
891 
899  // FIXME: dummy implementation now
900  bool get_has_display() const { return false; }
901 
907  size_t get_dedicated_memory() const { return pDev->get_mem(); }
908 
914  bool get_supports_double_precision() const { return pDev->is_double(); }
915 
922  bool get_supports_limited_double_precision() const { return pDev->is_lim_double(); }
923 
928  // FIXME: dummy implementation now
929  bool get_is_debug() const { return false; }
930 
935  bool get_is_emulated() const { return pDev->is_emulated(); }
936 
941  bool get_supports_cpu_shared_memory() const { return pDev->is_unified(); }
942 
946  access_type get_default_cpu_access_type() const { return pDev->get_access(); }
947 
948 
953  int get_max_tile_static_size() const {
954  return get_default_view().get_max_tile_static_size();
955  }
956 
960  std::vector<accelerator_view> get_all_views() {
961  std::vector<accelerator_view> result;
962  std::vector< std::shared_ptr<Kalmar::KalmarQueue> > queues = pDev->get_all_queues();
963  for (auto q : queues) {
964  result.push_back(q);
965  }
966  return result;
967  }
968 
977  void* get_hsa_am_region() const {
978  return get_default_view().get_hsa_am_region();
979  }
980 
989  void* get_hsa_am_system_region() const {
990  return get_default_view().get_hsa_am_system_region();
991  }
992 
1001  void* get_hsa_am_finegrained_system_region() const {
1002  return get_default_view().get_hsa_am_finegrained_system_region();
1003  }
1004 
1012  void* get_hsa_kernarg_region() const {
1013  return get_default_view().get_hsa_kernarg_region();
1014  }
1015 
1019  bool is_hsa_accelerator() const {
1020  return get_default_view().is_hsa_accelerator();
1021  }
1022 
1029  hcAgentProfile get_profile() const {
1030  return pDev->getProfile();
1031  }
1032 
1033  void memcpy_symbol(const char* symbolName, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
1034  pDev->memcpySymbol(symbolName, hostptr, count, offset, kind);
1035  }
1036 
1037  void memcpy_symbol(void* symbolAddr, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {
1038  pDev->memcpySymbol(symbolAddr, hostptr, count, offset, kind);
1039  }
1040 
1041  void* get_symbol_address(const char* symbolName) {
1042  return pDev->getSymbolAddress(symbolName);
1043  }
1044 
1051  void* get_hsa_agent() const {
1052  return pDev->getHSAAgent();
1053  }
1054 
1061  bool get_is_peer(const accelerator& other) const {
1062  return pDev->is_peer(other.pDev);
1063  }
1064 
1070  std::vector<accelerator> get_peers() const {
1071  std::vector<accelerator> peers;
1072 
1073  const auto &accs = get_all();
1074 
1075  for(auto iter = accs.begin(); iter != accs.end(); iter++)
1076  {
1077  if(this->get_is_peer(*iter))
1078  peers.push_back(*iter);
1079  }
1080  return peers;
1081  }
1082 
1087  unsigned int get_cu_count() const {
1088  return pDev->get_compute_unit_count();
1089  }
1090 
1095  int get_seqnum() const {
1096  return pDev->get_seqnum();
1097  }
1098 
1099 
1106  bool has_cpu_accessible_am() const {
1107  return pDev->has_cpu_accessible_am();
1108  };
1109 
1110  Kalmar::KalmarDevice *get_dev_ptr() const { return pDev; };
1111 
1112 private:
1113  accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {}
1114  friend class accelerator_view;
1115  Kalmar::KalmarDevice* pDev;
1116 };
1117 
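A short sketch of the accelerator API above (not part of hc.hpp): enumerate devices, print their identification strings, and create an extra execution queue on the first one.

    #include <hc.hpp>
    #include <iostream>
    #include <vector>

    void list_accelerators() {
      std::vector<hc::accelerator> accs = hc::accelerator::get_all();
      for (const auto& a : accs)
        std::wcout << a.get_device_path() << L": " << a.get_description() << L"\n";

      if (!accs.empty()) {
        hc::accelerator_view av = accs[0].create_view();  // execute_in_order by default
        (void)av;
      }
    }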
1118 // ------------------------------------------------------------------------
1119 // completion_future
1120 // ------------------------------------------------------------------------
1121 
1130 class completion_future {
1131 public:
1132 
1138  completion_future() : __amp_future(), __thread_then(nullptr), __asyncOp(nullptr) {};
1139 
1147  completion_future(const completion_future& other)
1148  : __amp_future(other.__amp_future), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}
1149 
1159  completion_future(completion_future&& other)
1160  : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then), __asyncOp(other.__asyncOp) {}
1161 
1170  completion_future& operator=(const completion_future& _Other) {
1171  if (this != &_Other) {
1172  __amp_future = _Other.__amp_future;
1173  __thread_then = _Other.__thread_then;
1174  __asyncOp = _Other.__asyncOp;
1175  }
1176  return (*this);
1177  }
1178 
1188  completion_future& operator=(completion_future&& _Other) {
1189  if (this != &_Other) {
1190  __amp_future = std::move(_Other.__amp_future);
1191  __thread_then = _Other.__thread_then;
1192  __asyncOp = _Other.__asyncOp;
1193  }
1194  return (*this);
1195  }
1196 
1204  void get() const {
1205  __amp_future.get();
1206  }
1207 
1213  bool valid() const {
1214  return __amp_future.valid();
1215  }
1216 
1235  void wait(hcWaitMode mode = hcWaitModeBlocked) const {
1236  if (this->valid()) {
1237  if (__asyncOp != nullptr) {
1238  __asyncOp->setWaitMode(mode);
1239  }
1240  //TODO-ASYNC - need to reclaim older AsyncOps here.
1241  __amp_future.wait();
1242  }
1243 
1244  Kalmar::getContext()->flushPrintfBuffer();
1245  }
1246 
1247  template <class _Rep, class _Period>
1248  std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const {
1249  return __amp_future.wait_for(_Rel_time);
1250  }
1251 
1252  template <class _Clock, class _Duration>
1253  std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const {
1254  return __amp_future.wait_until(_Abs_time);
1255  }
1256 
1264  operator std::shared_future<void>() const {
1265  return __amp_future;
1266  }
1267 
1274  // FIXME: notice we removed const from the signature here
1275  // the original signature in the specification should be
1276  // template<typename functor>
1277  // void then(const functor& func) const;
1278  template<typename functor>
1279  void then(const functor & func) {
1280 #if __KALMAR_ACCELERATOR__ != 1
1281  // could only assign once
1282  if (__thread_then == nullptr) {
1283  // spawn a new thread to wait on the future and then execute the callback functor
1284  __thread_then = new std::thread([&]() __CPU__ {
1285  this->wait();
1286  if(this->valid())
1287  func();
1288  });
1289  }
1290 #endif
1291  }
1292 
1302  void* get_native_handle() const {
1303  if (__asyncOp != nullptr) {
1304  return __asyncOp->getNativeHandle();
1305  } else {
1306  return nullptr;
1307  }
1308  }
1309 
1316  uint64_t get_begin_tick() {
1317  if (__asyncOp != nullptr) {
1318  return __asyncOp->getBeginTimestamp();
1319  } else {
1320  return 0L;
1321  }
1322  }
1323 
1330  uint64_t get_end_tick() {
1331  if (__asyncOp != nullptr) {
1332  return __asyncOp->getEndTimestamp();
1333  } else {
1334  return 0L;
1335  }
1336  }
1337 
1344  uint64_t get_tick_frequency() {
1345  if (__asyncOp != nullptr) {
1346  return __asyncOp->getTimestampFrequency();
1347  } else {
1348  return 0L;
1349  }
1350  }
1351 
1357  bool is_ready() {
1358  if (__asyncOp != nullptr) {
1359  return __asyncOp->isReady();
1360  } else {
1361  return false;
1362  }
1363  }
1364 
1365  ~completion_future() {
1366  if (__thread_then != nullptr) {
1367  __thread_then->join();
1368  }
1369  delete __thread_then;
1370  __thread_then = nullptr;
1371 
1372  if (__asyncOp != nullptr) {
1373  __asyncOp = nullptr;
1374  }
1375  }
1376 
1377 
1381  int get_use_count() const { return __asyncOp.use_count(); };
1382 
1383 private:
1384  std::shared_future<void> __amp_future;
1385  std::thread* __thread_then = nullptr;
1386  std::shared_ptr<Kalmar::KalmarAsyncOp> __asyncOp;
1387 
1388  completion_future(std::shared_ptr<Kalmar::KalmarAsyncOp> event) : __amp_future(*(event->getFuture())), __asyncOp(event) {}
1389 
1390  completion_future(const std::shared_future<void> &__future)
1391  : __amp_future(__future), __thread_then(nullptr), __asyncOp(nullptr) {}
1392 
1393  friend class Kalmar::HSAQueue;
1394 
1395  // non-tiled parallel_for_each
1396  // generic version
1397  template <int N, typename Kernel> friend
1398  completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
1399 
1400  // 1D specialization
1401  template <typename Kernel> friend
1402  completion_future parallel_for_each(const accelerator_view&, const extent<1>&, const Kernel&);
1403 
1404  // 2D specialization
1405  template <typename Kernel> friend
1406  completion_future parallel_for_each(const accelerator_view&, const extent<2>&, const Kernel&);
1407 
1408  // 3D specialization
1409  template <typename Kernel> friend
1410  completion_future parallel_for_each(const accelerator_view&, const extent<3>&, const Kernel&);
1411 
1412  // tiled parallel_for_each, 3D version
1413  template <typename Kernel> friend
1414  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
1415 
1416  // tiled parallel_for_each, 2D version
1417  template <typename Kernel> friend
1418  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
1419 
1420  // tiled parallel_for_each, 1D version
1421  template <typename Kernel> friend
1422  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
1423 
1424  // copy_async
1425  template <typename T, int N> friend
1426  completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);
1427  template <typename T, int N> friend
1428  completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
1429  template <typename T, int N> friend
1430  completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
1431  template <typename T, int N> friend
1432  completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest);
1433  template <typename T, int N> friend
1434  completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);
1435 
1436  template <typename InputIter, typename T, int N> friend
1437  completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
1438  template <typename InputIter, typename T, int N> friend
1439  completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
1440  template <typename InputIter, typename T, int N> friend
1441  completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
1442  template <typename InputIter, typename T, int N> friend
1443  completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);
1444  template <typename OutputIter, typename T, int N> friend
1445  completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
1446  template <typename OutputIter, typename T, int N> friend
1447  completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);
1448 
1449  // array_view
1450  template <typename T, int N> friend class array_view;
1451 
1452  // accelerator_view
1453  friend class accelerator_view;
1454 };
1455 
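A sketch of how completion_future is typically consumed (not part of hc.hpp): an asynchronous copy returns a future that can be waited on or chained with then(). The plain host vectors and sizes here are purely illustrative; production code usually stages through pinned or device memory.

    #include <hc.hpp>
    #include <vector>

    void async_copy_demo() {
      std::vector<int> src(1024, 1), dst(1024, 0);
      hc::accelerator_view av = hc::accelerator().get_default_view();

      hc::completion_future cf =
          av.copy_async(src.data(), dst.data(), src.size() * sizeof(int));

      // ... overlap independent host work here ...
      cf.wait();   // block until the copy has finished
    }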
1456 // ------------------------------------------------------------------------
1457 // member function implementations
1458 // ------------------------------------------------------------------------
1459 
1460 inline accelerator
1461 accelerator_view::get_accelerator() const { return pQueue->getDev(); }
1462 
1463 inline completion_future
1464 accelerator_view::create_marker(memory_scope scope) const {
1465  std::shared_ptr<Kalmar::KalmarAsyncOp> deps[1];
1466  // If necessary create an explicit dependency on previous command
1467  // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete.
1468  std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);
1469 
1470  int cnt = 0;
1471  if (depOp) {
1472  deps[cnt++] = depOp; // retrieve async op associated with completion_future
1473  }
1474 
1475  return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
1476 }
1477 
1478 inline unsigned int accelerator_view::get_version() const { return get_accelerator().get_version(); }
1479 
1480 inline completion_future accelerator_view::create_blocking_marker(completion_future& dependent_future, memory_scope scope) const {
1481  std::shared_ptr<Kalmar::KalmarAsyncOp> deps[2];
1482 
1483  // If necessary create an explicit dependency on previous command
1484  // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete.
1485  std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);
1486 
1487  int cnt = 0;
1488  if (depOp) {
1489  deps[cnt++] = depOp; // retrieve async op associated with completion_future
1490  }
1491 
1492  if (dependent_future.__asyncOp) {
1493  deps[cnt++] = dependent_future.__asyncOp; // retrieve async op associated with completion_future
1494  }
1495 
1496  return completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
1497 }
1498 
1499 template<typename InputIterator>
1500 inline completion_future
1501 accelerator_view::create_blocking_marker(InputIterator first, InputIterator last, memory_scope scope) const {
1502  std::shared_ptr<Kalmar::KalmarAsyncOp> deps[5]; // array of 5 pointers to the native handle of async ops. 5 is the max supported by barrier packet
1503  hc::completion_future lastMarker;
1504 
1505 
1506  // If necessary create an explicit dependency on previous command
1507  // This is necessary for example if copy command is followed by marker - we need the marker to wait for the copy to complete.
1508  std::shared_ptr<Kalmar::KalmarAsyncOp> depOp = pQueue->detectStreamDeps(hcCommandMarker, nullptr);
1509 
1510  int cnt = 0;
1511  if (depOp) {
1512  deps[cnt++] = depOp; // retrieve async op associated with completion_future
1513  }
1514 
1515 
1516  // loop through signals and group into sections of 5
1517  // every 5 signals goes into one barrier packet
1518  // since HC sets the barrier bit in each AND barrier packet, we know
1519  // the barriers will execute in-order
1520  for (auto iter = first; iter != last; ++iter) {
1521  if (iter->__asyncOp) {
1522  deps[cnt++] = iter->__asyncOp; // retrieve async op associated with completion_future
1523  if (cnt == 5) {
1524  lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, hc::no_scope));
1525  cnt = 0;
1526  }
1527  }
1528  }
1529 
1530  if (cnt) {
1531  lastMarker = completion_future(pQueue->EnqueueMarkerWithDependency(cnt, deps, scope));
1532  }
1533 
1534  return lastMarker;
1535 }
1536 
1537 inline completion_future
1538 accelerator_view::create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope scope) const {
1539  return create_blocking_marker(dependent_future_list.begin(), dependent_future_list.end(), scope);
1540 }
1541 
1542 
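A sketch of create_blocking_marker() in action (not part of hc.hpp): a marker enqueued on one view that completes only after a future produced on another view has completed, giving a cross-queue dependency.

    #include <hc.hpp>
    #include <vector>

    void cross_queue_dependency() {
      hc::accelerator acc;
      hc::accelerator_view av0 = acc.create_view();
      hc::accelerator_view av1 = acc.create_view();

      std::vector<char> a(256), b(256);
      hc::completion_future copy_done = av0.copy_async(a.data(), b.data(), a.size());

      // Marker on av1 that waits for the copy submitted on av0.
      hc::completion_future gate = av1.create_blocking_marker(copy_done);
      gate.wait();
    }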
1543 inline void accelerator_view::copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, const hc::accelerator *copyAcc, bool forceUnpinnedCopy) {
1544  pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo, copyAcc ? copyAcc->pDev : nullptr, forceUnpinnedCopy);
1545 };
1546 
1547 inline void accelerator_view::copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceHostCopyEngine) {
1548  pQueue->copy_ext(src, dst, size_bytes, copyDir, srcInfo, dstInfo, forceHostCopyEngine);
1549 };
1550 
1551 inline completion_future
1552 accelerator_view::copy_async(const void *src, void *dst, size_t size_bytes) {
1553  return completion_future(pQueue->EnqueueAsyncCopy(src, dst, size_bytes));
1554 }
1555 
1556 inline completion_future
1557 accelerator_view::copy_async_ext(const void *src, void *dst, size_t size_bytes,
1558  hcCommandKind copyDir,
1559  const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
1560  const hc::accelerator *copyAcc)
1561 {
1562  return completion_future(pQueue->EnqueueAsyncCopyExt(src, dst, size_bytes, copyDir, srcInfo, dstInfo, copyAcc ? copyAcc->pDev : nullptr));
1563 };
1564 
1565 
1566 // ------------------------------------------------------------------------
1567 // extent
1568 // ------------------------------------------------------------------------
1569 
1577 template <int N>
1578 class extent {
1579 public:
1583  static const int rank = N;
1584 
1588  typedef int value_type;
1589 
1594  extent() __CPU__ __HC__ : base_() {
1595  static_assert(N > 0, "Dimensionality must be positive");
1596  };
1597 
1604  extent(const extent& other) __CPU__ __HC__
1605  : base_(other.base_) {}
1606 
1616  explicit extent(int e0) __CPU__ __HC__
1617  : base_(e0) {}
1618 
1619  template <typename ..._Tp>
1620  explicit extent(_Tp ... __t) __CPU__ __HC__
1621  : base_(__t...) {
1622  static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor");
1623  static_assert(sizeof...(__t) == N, "rank should be consistency");
1624  }
1625 
1636  explicit extent(const int components[]) __CPU__ __HC__
1637  : base_(components) {}
1638 
1647  explicit extent(int components[]) __CPU__ __HC__
1648  : base_(components) {}
1649 
1657  extent& operator=(const extent& other) __CPU__ __HC__ {
1658  base_.operator=(other.base_);
1659  return *this;
1660  }
1661 
1669  int operator[] (unsigned int c) const __CPU__ __HC__ {
1670  return base_[c];
1671  }
1672  int& operator[] (unsigned int c) __CPU__ __HC__ {
1673  return base_[c];
1674  }
1675 
1686  bool contains(const index<N>& idx) const __CPU__ __HC__ {
1687  return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
1688  }
1689 
1695  unsigned int size() const __CPU__ __HC__ {
1696  return Kalmar::index_helper<N, extent<N>>::count_size(*this);
1697  }
1698 
1711  tiled_extent<1> tile(int t0) const;
1712  tiled_extent<2> tile(int t0, int t1) const;
1713  tiled_extent<3> tile(int t0, int t1, int t2) const;
1714 
1722  tiled_extent<1> tile_with_dynamic(int t0, int dynamic_size) const;
1723  tiled_extent<2> tile_with_dynamic(int t0, int t1, int dynamic_size) const;
1724  tiled_extent<3> tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const;
1725 
1738  bool operator==(const extent& other) const __CPU__ __HC__ {
1739  return Kalmar::index_helper<N, extent<N> >::equal(*this, other);
1740  }
1741  bool operator!=(const extent& other) const __CPU__ __HC__ {
1742  return !(*this == other);
1743  }
1744 
1755  extent& operator+=(const extent& __r) __CPU__ __HC__ {
1756  base_.operator+=(__r.base_);
1757  return *this;
1758  }
1759  extent& operator-=(const extent& __r) __CPU__ __HC__ {
1760  base_.operator-=(__r.base_);
1761  return *this;
1762  }
1763  extent& operator*=(const extent& __r) __CPU__ __HC__ {
1764  base_.operator*=(__r.base_);
1765  return *this;
1766  }
1767  extent& operator/=(const extent& __r) __CPU__ __HC__ {
1768  base_.operator/=(__r.base_);
1769  return *this;
1770  }
1771  extent& operator%=(const extent& __r) __CPU__ __HC__ {
1772  base_.operator%=(__r.base_);
1773  return *this;
1774  }
1775 
1786  extent operator+(const index<N>& idx) __CPU__ __HC__ {
1787  extent __r = *this;
1788  __r += idx;
1789  return __r;
1790  }
1791  extent operator-(const index<N>& idx) __CPU__ __HC__ {
1792  extent __r = *this;
1793  __r -= idx;
1794  return __r;
1795  }
1796  extent& operator+=(const index<N>& idx) __CPU__ __HC__ {
1797  base_.operator+=(idx.base_);
1798  return *this;
1799  }
1800  extent& operator-=(const index<N>& idx) __CPU__ __HC__ {
1801  base_.operator-=(idx.base_);
1802  return *this;
1803  }
1804 
1816  extent& operator+=(int value) __CPU__ __HC__ {
1817  base_.operator+=(value);
1818  return *this;
1819  }
1820  extent& operator-=(int value) __CPU__ __HC__ {
1821  base_.operator-=(value);
1822  return *this;
1823  }
1824  extent& operator*=(int value) __CPU__ __HC__ {
1825  base_.operator*=(value);
1826  return *this;
1827  }
1828  extent& operator/=(int value) __CPU__ __HC__ {
1829  base_.operator/=(value);
1830  return *this;
1831  }
1832  extent& operator%=(int value) __CPU__ __HC__ {
1833  base_.operator%=(value);
1834  return *this;
1835  }
1836 
1847  extent& operator++() __CPU__ __HC__ {
1848  base_.operator+=(1);
1849  return *this;
1850  }
1851  extent operator++(int) __CPU__ __HC__ {
1852  extent ret = *this;
1853  base_.operator+=(1);
1854  return ret;
1855  }
1856  extent& operator--() __CPU__ __HC__ {
1857  base_.operator-=(1);
1858  return *this;
1859  }
1860  extent operator--(int) __CPU__ __HC__ {
1861  extent ret = *this;
1862  base_.operator-=(1);
1863  return ret;
1864  }
1865 
1868 private:
1869  typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
1870  base base_;
1871  template <int K, typename Q> friend struct Kalmar::index_helper;
1872  template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
1873 };
1874 
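A sketch of extent<N> driving a kernel launch (not part of hc.hpp): a 2-D extent describes the compute domain and parallel_for_each runs the lambda once per index in it.

    #include <hc.hpp>

    void scale(const hc::accelerator_view& av, hc::array_view<float, 2> data, float s) {
      hc::extent<2> ext = data.get_extent();
      hc::completion_future cf =
          hc::parallel_for_each(av, ext, [=](hc::index<2> idx) [[hc]] {
            data[idx] *= s;   // one work-item per point of ext
          });
      cf.wait();
    }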
1875 // ------------------------------------------------------------------------
1876 // global functions for extent
1877 // ------------------------------------------------------------------------
1878 
1889 // FIXME: the signature is not entirely the same as defined in:
1890 // C++AMP spec v1.2 #1253
1891 template <int N>
1892 extent<N> operator+(const extent<N>& lhs, const extent<N>& rhs) __CPU__ __HC__ {
1893  extent<N> __r = lhs;
1894  __r += rhs;
1895  return __r;
1896 }
1897 template <int N>
1898 extent<N> operator-(const extent<N>& lhs, const extent<N>& rhs) __CPU__ __HC__ {
1899  extent<N> __r = lhs;
1900  __r -= rhs;
1901  return __r;
1902 }
1903 
1920 // FIXME: the signature is not entirely the same as defined in:
1921 // C++AMP spec v1.2 #1259
1922 template <int N>
1923 extent<N> operator+(const extent<N>& ext, int value) __CPU__ __HC__ {
1924  extent<N> __r = ext;
1925  __r += value;
1926  return __r;
1927 }
1928 template <int N>
1929 extent<N> operator+(int value, const extent<N>& ext) __CPU__ __HC__ {
1930  extent<N> __r = ext;
1931  __r += value;
1932  return __r;
1933 }
1934 template <int N>
1935 extent<N> operator-(const extent<N>& ext, int value) __CPU__ __HC__ {
1936  extent<N> __r = ext;
1937  __r -= value;
1938  return __r;
1939 }
1940 template <int N>
1941 extent<N> operator-(int value, const extent<N>& ext) __CPU__ __HC__ {
1942  extent<N> __r(value);
1943  __r -= ext;
1944  return __r;
1945 }
1946 template <int N>
1947 extent<N> operator*(const extent<N>& ext, int value) __CPU__ __HC__ {
1948  extent<N> __r = ext;
1949  __r *= value;
1950  return __r;
1951 }
1952 template <int N>
1953 extent<N> operator*(int value, const extent<N>& ext) __CPU__ __HC__ {
1954  extent<N> __r = ext;
1955  __r *= value;
1956  return __r;
1957 }
1958 template <int N>
1959 extent<N> operator/(const extent<N>& ext, int value) __CPU__ __HC__ {
1960  extent<N> __r = ext;
1961  __r /= value;
1962  return __r;
1963 }
1964 template <int N>
1965 extent<N> operator/(int value, const extent<N>& ext) __CPU__ __HC__ {
1966  extent<N> __r(value);
1967  __r /= ext;
1968  return __r;
1969 }
1970 template <int N>
1971 extent<N> operator%(const extent<N>& ext, int value) __CPU__ __HC__ {
1972  extent<N> __r = ext;
1973  __r %= value;
1974  return __r;
1975 }
1976 template <int N>
1977 extent<N> operator%(int value, const extent<N>& ext) __CPU__ __HC__ {
1978  extent<N> __r(value);
1979  __r %= ext;
1980  return __r;
1981 }
1982 
1985 // ------------------------------------------------------------------------
1986 // tiled_extent
1987 // ------------------------------------------------------------------------
1988 
1995 template <int N>
1996 class tiled_extent : public extent<N> {
1997 public:
1998  static const int rank = N;
1999 
2003  int tile_dim[N];
2004 
2009  tiled_extent() __CPU__ __HC__ : extent<N>(), tile_dim{0} {}
2010 
2018  tiled_extent(const tiled_extent& other) __CPU__ __HC__ : extent<N>(other) {
2019  for (int i = 0; i < N; ++i) {
2020  tile_dim[i] = other.tile_dim[i];
2021  }
2022  }
2023 };
2024 
2030 template <>
2031 class tiled_extent<1> : public extent<1> {
2032 private:
2036  unsigned int dynamic_group_segment_size;
2037 
2038 public:
2039  static const int rank = 1;
2040 
2044  int tile_dim[1];
2045 
2050  tiled_extent() __CPU__ __HC__ : extent(0), dynamic_group_segment_size(0), tile_dim{0} {}
2051 
2059  tiled_extent(int e0, int t0) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(0), tile_dim{t0} {}
2060 
2069  tiled_extent(int e0, int t0, int size) __CPU__ __HC__ : extent(e0), dynamic_group_segment_size(size), tile_dim{t0} {}
2070 
2078  tiled_extent(const tiled_extent<1>& other) __CPU__ __HC__ : extent(other[0]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0]} {}
2079 
2080 
2087  tiled_extent(const extent<1>& ext, int t0) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0} {}
2088 
2096  tiled_extent(const extent<1>& ext, int t0, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0} {}
2097 
2104  void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
2105  dynamic_group_segment_size = size;
2106  }
2107 
2111  unsigned int get_dynamic_group_segment_size() const __CPU__ {
2112  return dynamic_group_segment_size;
2113  }
2114 };
2115 
2121 template <>
2122 class tiled_extent<2> : public extent<2> {
2123 private:
2127  unsigned int dynamic_group_segment_size;
2128 
2129 public:
2130  static const int rank = 2;
2131 
2135  int tile_dim[2];
2136 
2141  tiled_extent() __CPU__ __HC__ : extent(0, 0), dynamic_group_segment_size(0), tile_dim{0, 0} {}
2142 
2152  tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(0), tile_dim{t0, t1} {}
2153 
2164  tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__ : extent(e0, e1), dynamic_group_segment_size(size), tile_dim{t0, t1} {}
2165 
2173  tiled_extent(const tiled_extent<2>& other) __CPU__ __HC__ : extent(other[0], other[1]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1]} {}
2174 
2182  tiled_extent(const extent<2>& ext, int t0, int t1) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1} {}
2183 
2192  tiled_extent(const extent<2>& ext, int t0, int t1, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1} {}
2193 
2200  void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
2201  dynamic_group_segment_size = size;
2202  }
2203 
2207  unsigned int get_dynamic_group_segment_size() const __CPU__ {
2208  return dynamic_group_segment_size;
2209  }
2210 };
2211 
2217 template <>
2218 class tiled_extent<3> : public extent<3> {
2219 private:
2223  unsigned int dynamic_group_segment_size;
2224 
2225 public:
2226  static const int rank = 3;
2227 
2231  int tile_dim[3];
2232 
2237  tiled_extent() __CPU__ __HC__ : extent(0, 0, 0), dynamic_group_segment_size(0), tile_dim{0, 0, 0} {}
2238 
2250  tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}
2251 
2264  tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(e0, e1, e2), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}
2265 
2273  tiled_extent(const tiled_extent<3>& other) __CPU__ __HC__ : extent(other[0], other[1], other[2]), dynamic_group_segment_size(other.dynamic_group_segment_size), tile_dim{other.tile_dim[0], other.tile_dim[1], other.tile_dim[2]} {}
2274 
2283  tiled_extent(const extent<3>& ext, int t0, int t1, int t2) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(0), tile_dim{t0, t1, t2} {}
2284 
2294  tiled_extent(const extent<3>& ext, int t0, int t1, int t2, int size) __CPU__ __HC__ : extent(ext), dynamic_group_segment_size(size), tile_dim{t0, t1, t2} {}
2295 
2302  void set_dynamic_group_segment_size(unsigned int size) __CPU__ {
2303  dynamic_group_segment_size = size;
2304  }
2305 
2309  unsigned int get_dynamic_group_segment_size() const __CPU__ {
2310  return dynamic_group_segment_size;
2311  }
2312 };
2313 
2314 // ------------------------------------------------------------------------
2315 // implementation of extent<N>::tile()
2316 // ------------------------------------------------------------------------
2317 
2318 template <int N>
2319 inline
2320 tiled_extent<1> extent<N>::tile(int t0) const __CPU__ __HC__ {
2321  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
2322  return tiled_extent<1>(*this, t0);
2323 }
2324 
2325 template <int N>
2326 inline
2327 tiled_extent<2> extent<N>::tile(int t0, int t1) const __CPU__ __HC__ {
2328  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
2329  return tiled_extent<2>(*this, t0, t1);
2330 }
2331 
2332 template <int N>
2333 inline
2334 tiled_extent<3> extent<N>::tile(int t0, int t1, int t2) const __CPU__ __HC__ {
2335  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
2336  return tiled_extent<3>(*this, t0, t1, t2);
2337 }
2338 
2339 // ------------------------------------------------------------------------
2340 // implementation of extent<N>::tile_with_dynamic()
2341 // ------------------------------------------------------------------------
2342 
2343 template <int N>
2344 inline
2345 tiled_extent<1> extent<N>::tile_with_dynamic(int t0, int dynamic_size) const __CPU__ __HC__ {
2346  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
2347  return tiled_extent<1>(*this, t0, dynamic_size);
2348 }
2349 
2350 template <int N>
2351 inline
2352 tiled_extent<2> extent<N>::tile_with_dynamic(int t0, int t1, int dynamic_size) const __CPU__ __HC__ {
2353  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
2354  return tiled_extent<2>(*this, t0, t1, dynamic_size);
2355 }
2356 
2357 template <int N>
2358 inline
2359 tiled_extent<3> extent<N>::tile_with_dynamic(int t0, int t1, int t2, int dynamic_size) const __CPU__ __HC__ {
2360  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
2361  return tiled_extent<3>(*this, t0, t1, t2, dynamic_size);
2362 }
2363 
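A sketch of the tiling helpers just defined (not part of hc.hpp): tile() partitions an extent into fixed-size tiles, and tile_with_dynamic() additionally reserves dynamic group-segment (LDS) bytes for the launch.

    #include <hc.hpp>

    void make_tiled_extents() {
      hc::extent<1> e1(4096);
      hc::tiled_extent<1> t1 = e1.tile(256);                    // 256 work-items per tile

      hc::extent<2> e2(1024, 1024);
      hc::tiled_extent<2> t2 = e2.tile(16, 16);                 // 16x16 tiles

      // Same 1-D shape plus 4 KB of dynamic group segment per tile.
      hc::tiled_extent<1> t1d = e1.tile_with_dynamic(256, 4096);
      (void)t1; (void)t2; (void)t1d;
    }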
2364 // ------------------------------------------------------------------------
2365 // Intrinsic functions for HSAIL instructions
2366 // ------------------------------------------------------------------------
2367 
2373 #define __HSA_WAVEFRONT_SIZE__ (64)
2374 extern "C" unsigned int __wavesize() __HC__;
2375 
2376 
2377 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2378 extern "C" inline unsigned int __wavesize() __HC__ {
2379  return __HSA_WAVEFRONT_SIZE__;
2380 }
2381 #endif
2382 
2389 extern "C" inline unsigned int __popcount_u32_b32(unsigned int input) __HC__ {
2390  return __builtin_popcount(input);
2391 }
2392 
2399 extern "C" inline unsigned int __popcount_u32_b64(unsigned long long int input) __HC__ {
2400  return __builtin_popcountl(input);
2401 }
2402 
2409 extern "C" inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__ {
2410  return (src0 << (32 - src1 - src2)) >> (32 - src2);
2411 }
2412 
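Reading the implementation above: src1 is the bit offset and src2 is the field width, so the call extracts bits [src1+src2-1 : src1]. A device-side sketch (not part of hc.hpp, assuming the header is included); for 0xABCD1234 with offset 8 and width 8 the result is 0x12.

    inline unsigned int field_demo() [[hc]] {
      // (0xABCD1234 << 16) >> 24 == 0x12
      return hc::__bitextract_u32(0xABCD1234u, /*offset*/ 8, /*width*/ 8);
    }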
2413 extern "C" uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2414 
2415 extern "C" int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__;
2416 
2417 extern "C" int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__;
2426 extern "C" unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__;
2427 
2428 extern "C" uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__;
2429 
2430 extern "C" int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__;
2431 
2432 extern "C" int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__;
2441 extern "C" unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__;
2442 
2443 extern "C" uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__;
2453 unsigned int __bitrev_b32(unsigned int src0) [[hc]] __asm("llvm.bitreverse.i32");
2454 
2455 uint64_t __bitrev_b64(uint64_t src0) [[hc]] __asm("llvm.bitreverse.i64");
2456 
2465 extern "C" unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2466 
2467 extern "C" uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__;
2477 extern "C" inline unsigned int __firstbit_u32_u32(unsigned int input) __HC__ {
2478  return input == 0 ? -1 : __builtin_clz(input);
2479 }
2480 
2481 
2489 extern "C" inline unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__ {
2490  return input == 0 ? -1 : __builtin_clzl(input);
2491 }
2492 
2502 extern "C" inline unsigned int __firstbit_u32_s32(int input) __HC__ {
2503  if (input == 0) {
2504  return -1;
2505  }
2506 
2507  return input > 0 ? __firstbit_u32_u32(input) : __firstbit_u32_u32(~input);
2508 }
2509 
2510 
2520 extern "C" inline unsigned int __firstbit_u32_s64(long long int input) __HC__ {
2521  if (input == 0) {
2522  return -1;
2523  }
2524 
2525  return input > 0 ? __firstbit_u32_u64(input) : __firstbit_u32_u64(~input);
2526 }
2527 
2535 extern "C" inline unsigned int __lastbit_u32_u32(unsigned int input) __HC__ {
2536  return input == 0 ? -1 : __builtin_ctz(input);
2537 }
2538 
2539 extern "C" inline unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__ {
2540  return input == 0 ? -1 : __builtin_ctzl(input);
2541 }
2542 
2543 extern "C" inline unsigned int __lastbit_u32_s32(int input) __HC__ {
2544  return __lastbit_u32_u32(input);
2545 }
2546 
2547 extern "C" inline unsigned int __lastbit_u32_s64(unsigned long long input) __HC__ {
2548  return __lastbit_u32_u64(input);
2549 }
2559 extern "C" unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__;
2560 
2561 extern "C" uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__;
2562 
2563 extern "C" unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__;
2564 
2565 extern "C" uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__;
2566 
2567 extern "C" uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__;
2568 
2569 extern "C" int __unpacklo_s8x4(int src0, int src1) __HC__;
2570 
2571 extern "C" int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__;
2572 
2573 extern "C" int __unpacklo_s16x2(int src0, int src1) __HC__;
2574 
2575 extern "C" int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__;
2576 
2577 extern "C" int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__;
2587 extern "C" unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__;
2588 
2589 extern "C" uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__;
2590 
2591 extern "C" unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__;
2592 
2593 extern "C" uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__;
2594 
2595 extern "C" uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__;
2596 
2597 extern "C" int __unpackhi_s8x4(int src0, int src1) __HC__;
2598 
2599 extern "C" int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__;
2600 
2601 extern "C" int __unpackhi_s16x2(int src0, int src1) __HC__;
2602 
2603 extern "C" int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__;
2604 
2605 extern "C" int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__;
2615 extern "C" unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2616 
2617 extern "C" uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2618 
2619 extern "C" unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2620 
2621 extern "C" uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2622 
2623 extern "C" uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__;
2624 
2625 extern "C" int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__;
2626 
2627 extern "C" int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__;
2628 
2629 extern "C" int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__;
2630 
2631 extern "C" int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__;
2632 
2633 extern "C" int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__;
2634 
2635 extern "C" double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__;
2644 extern "C" unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__;
2645 
2646 extern "C" unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__;
2647 
2648 extern "C" unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__;
2649 
2650 extern "C" unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__;
2651 
2652 extern "C" unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__;
2653 
2654 extern "C" int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__;
2655 
2656 extern "C" int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__;
2657 
2658 extern "C" int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__;
2659 
2660 extern "C" int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__;
2661 
2662 extern "C" int __unpack_s32_s3x2(int64_t src0, unsigned int src1) __HC__;
2663 
2664 extern "C" float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__;
2672 extern "C" unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2673 
2679 extern "C" unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2680 
2687 extern "C" unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2688 
2695 extern "C" unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__;
2696 
2702 extern "C" float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__;
2703 
2711 extern "C" unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2712 
2713 extern "C" unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2714 
2715 extern "C" unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2724 extern "C" unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__;
2725 
2729 extern "C" uint64_t __clock_u64() __HC__;
2730 
2736 extern "C" uint64_t __cycle_u64() __HC__;
2737 
2744 extern "C" unsigned int __activelaneid_u32() __HC__;
2745 
2758 extern "C" uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__;
2759 
2768 extern "C" inline unsigned int __activelanecount_u32_b1(unsigned int input) __HC__ {
2770 }
2771 
2772 // ------------------------------------------------------------------------
2773 // Wavefront Vote Functions
2774 // ------------------------------------------------------------------------
2775 
2781 extern "C" inline int __any(int predicate) __HC__ {
2783 }
2784 
2790 extern "C" inline int __all(int predicate) __HC__ {
2792 }
2793 
2800 extern "C" inline uint64_t __ballot(int predicate) __HC__ {
2801  return __activelanemask_v4_b64_b1(predicate);
2802 }
2803 
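A sketch of the vote intrinsics inside a kernel (not part of hc.hpp): every lane contributes its own predicate and the whole wavefront observes the combined result.

    #include <hc.hpp>

    void vote_demo(const hc::accelerator_view& av, hc::array_view<int, 1> data) {
      hc::parallel_for_each(av, data.get_extent(), [=](hc::index<1> idx) [[hc]] {
        int pred = data[idx] > 0;
        int any_pos  = hc::__any(pred);      // non-zero if any lane's pred != 0
        int all_pos  = hc::__all(pred);      // non-zero only if every active lane's pred != 0
        uint64_t bal = hc::__ballot(pred);   // one bit per lane
        data[idx] = any_pos + all_pos + int(bal & 1);
      }).wait();
    }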
2804 // ------------------------------------------------------------------------
2805 // Wavefront Shuffle Functions
2806 // ------------------------------------------------------------------------
2807 
2808 // utility union type
2809 union __u {
2810  int i;
2811  unsigned int u;
2812  float f;
2813 };
2814 
2835 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2836 
2837 /*
2838  * FIXME: We need to add __builtin_amdgcn_mbcnt_{lo,hi} to clang and call
2839  * them here instead.
2840  */
2841 
2842 int __amdgcn_mbcnt_lo(int mask, int src) [[hc]] __asm("llvm.amdgcn.mbcnt.lo");
2843 int __amdgcn_mbcnt_hi(int mask, int src) [[hc]] __asm("llvm.amdgcn.mbcnt.hi");
2844 
2845 inline int __lane_id(void) [[hc]] {
2846  int lo = __amdgcn_mbcnt_lo(-1, 0);
2847  return __amdgcn_mbcnt_hi(-1, lo);
2848 }
2849 
2850 #endif
2851 
2852 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2853 
2859 int __amdgcn_ds_bpermute(int index, int src) [[hc]] __asm("llvm.amdgcn.ds.bpermute");
2860 inline unsigned int __amdgcn_ds_bpermute(int index, unsigned int src) [[hc]] {
2861  __u tmp; tmp.u = src;
2862  tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
2863  return tmp.u;
2864 }
2865 inline float __amdgcn_ds_bpermute(int index, float src) [[hc]] {
2866  __u tmp; tmp.f = src;
2867  tmp.i = __amdgcn_ds_bpermute(index, tmp.i);
2868  return tmp.f;
2869 }
2870 
2874 extern "C" int __amdgcn_ds_permute(int index, int src) [[hc]];
2875 inline unsigned int __amdgcn_ds_permute(int index, unsigned int src) [[hc]] {
2876  __u tmp; tmp.u = src;
2877  tmp.i = __amdgcn_ds_permute(index, tmp.i);
2878  return tmp.u;
2879 }
2880 inline float __amdgcn_ds_permute(int index, float src) [[hc]] {
2881  __u tmp; tmp.f = src;
2882  tmp.i = __amdgcn_ds_permute(index, tmp.i);
2883  return tmp.f;
2884 }
2885 
2886 
2890 extern "C" int __amdgcn_ds_swizzle(int src, int pattern) [[hc]];
2891 inline unsigned int __amdgcn_ds_swizzle(unsigned int src, int pattern) [[hc]] {
2892  __u tmp; tmp.u = src;
2893  tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
2894  return tmp.u;
2895 }
2896 inline float __amdgcn_ds_swizzle(float src, int pattern) [[hc]] {
2897  __u tmp; tmp.f = src;
2898  tmp.i = __amdgcn_ds_swizzle(tmp.i, pattern);
2899  return tmp.f;
2900 }
2901 
2902 
2903 
2907 extern "C" int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) [[hc]];
2908 
2917 extern "C" int __amdgcn_wave_sr1(int src, bool bound_ctrl) [[hc]];
2918 inline unsigned int __amdgcn_wave_sr1(unsigned int src, bool bound_ctrl) [[hc]] {
2919  __u tmp; tmp.u = src;
2920  tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
2921  return tmp.u;
2922 }
2923 inline float __amdgcn_wave_sr1(float src, bool bound_ctrl) [[hc]] {
2924  __u tmp; tmp.f = src;
2925  tmp.i = __amdgcn_wave_sr1(tmp.i, bound_ctrl);
2926  return tmp.f;
2927 }
2928 
2937 extern "C" int __amdgcn_wave_sl1(int src, bool bound_ctrl) [[hc]];
2938 inline unsigned int __amdgcn_wave_sl1(unsigned int src, bool bound_ctrl) [[hc]] {
2939  __u tmp; tmp.u = src;
2940  tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
2941  return tmp.u;
2942 }
2943 inline float __amdgcn_wave_sl1(float src, bool bound_ctrl) [[hc]] {
2944  __u tmp; tmp.f = src;
2945  tmp.i = __amdgcn_wave_sl1(tmp.i, bound_ctrl);
2946  return tmp.f;
2947 }
2948 
2949 
2957 extern "C" int __amdgcn_wave_rr1(int src) [[hc]];
2958 inline unsigned int __amdgcn_wave_rr1(unsigned int src) [[hc]] {
2959  __u tmp; tmp.u = src;
2960  tmp.i = __amdgcn_wave_rr1(tmp.i);
2961  return tmp.u;
2962 }
2963 inline float __amdgcn_wave_rr1(float src) [[hc]] {
2964  __u tmp; tmp.f = src;
2965  tmp.i = __amdgcn_wave_rr1(tmp.i);
2966  return tmp.f;
2967 }
2968 
2976 extern "C" int __amdgcn_wave_rl1(int src) [[hc]];
2977 inline unsigned int __amdgcn_wave_rl1(unsigned int src) [[hc]] {
2978  __u tmp; tmp.u = src;
2979  tmp.i = __amdgcn_wave_rl1(tmp.i);
2980  return tmp.u;
2981 }
2982 inline float __amdgcn_wave_rl1(float src) [[hc]] {
2983  __u tmp; tmp.f = src;
2984  tmp.i = __amdgcn_wave_rl1(tmp.i);
2985  return tmp.f;
2986 }
2987 
2988 #endif
2989 
2990 /* definition to expand macro then apply to pragma message
2991 #define VALUE_TO_STRING(x) #x
2992 #define VALUE(x) VALUE_TO_STRING(x)
2993 #define VAR_NAME_VALUE(var) #var "=" VALUE(var)
2994 #pragma message(VAR_NAME_VALUE(__hcc_backend__))
2995 */
2996 
2997 #if __hcc_backend__==HCC_BACKEND_AMDGPU
2998 
2999 inline int __shfl(int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3000  int self = __lane_id();
3001  int index = srcLane + (self & ~(width-1));
3002  return __amdgcn_ds_bpermute(index<<2, var);
3003 }
3004 
3005 #endif
3006 
3007 inline unsigned int __shfl(unsigned int var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3008  __u tmp; tmp.u = var;
3009  tmp.i = __shfl(tmp.i, srcLane, width);
3010  return tmp.u;
3011 }
3012 
3013 
3014 inline float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3015  __u tmp; tmp.f = var;
3016  tmp.i = __shfl(tmp.i, srcLane, width);
3017  return tmp.f;
3018 }
3019 
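A device-side sketch (not part of hc.hpp, assuming the header is included): __shfl with a fixed source lane broadcasts that lane's value to every lane in the wavefront.

    inline float broadcast_lane0(float v) [[hc]] {
      // Every lane receives the value held by lane 0 of its wavefront.
      return hc::__shfl(v, 0);
    }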
3020 // FIXME: support half type
3045 #if __hcc_backend__==HCC_BACKEND_AMDGPU
3046 
3047 inline int __shfl_up(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3048  int self = __lane_id();
3049  int index = self - delta;
3050  index = (index < (self & ~(width-1)))?self:index;
3051  return __amdgcn_ds_bpermute(index<<2, var);
3052 }
3053 
3054 #endif
3055 
3056 inline unsigned int __shfl_up(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3057  __u tmp; tmp.u = var;
3058  tmp.i = __shfl_up(tmp.i, delta, width);
3059  return tmp.u;
3060 }
3061 
3062 inline float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3063  __u tmp; tmp.f = var;
3064  tmp.i = __shfl_up(tmp.i, delta, width);
3065  return tmp.f;
3066 }
3067 
3068 // FIXME: support half type
3094 #if __hcc_backend__==HCC_BACKEND_AMDGPU
3095 
3096 inline int __shfl_down(int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3097  int self = __lane_id();
3098  int index = self + delta;
3099  index = (int)((self&(width-1))+delta) >= width?self:index;
3100  return __amdgcn_ds_bpermute(index<<2, var);
3101 }
3102 
3103 #endif
3104 
3105 inline unsigned int __shfl_down(unsigned int var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3106  __u tmp; tmp.u = var;
3107  tmp.i = __shfl_down(tmp.i, delta, width);
3108  return tmp.u;
3109 }
3110 
3111 inline float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3112  __u tmp; tmp.f = var;
3113  tmp.i = __shfl_down(tmp.i, delta, width);
3114  return tmp.f;
3115 }
3116 
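A device-side sketch (not part of hc.hpp): the usual wavefront sum reduction built on __shfl_down. After the loop, lane 0 holds the sum over the whole wavefront; other lanes hold partial values.

    inline float wavefront_sum(float v) [[hc]] {
      for (int offset = __HSA_WAVEFRONT_SIZE__ / 2; offset > 0; offset /= 2)
        v += hc::__shfl_down(v, offset);
      return v;   // meaningful on lane 0
    }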
3117 
3118 // FIXME: support half type
3139 #if __hcc_backend__==HCC_BACKEND_AMDGPU
3140 
3141 
3142 inline int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3143  int self = __lane_id();
3144  int index = self^laneMask;
3145  index = index >= ((self+width)&~(width-1))?self:index;
3146  return __amdgcn_ds_bpermute(index<<2, var);
3147 }
3148 
3149 #endif
3150 
3151 inline float __shfl_xor(float var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3152  __u tmp; tmp.f = var;
3153  tmp.i = __shfl_xor(tmp.i, laneMask, width);
3154  return tmp.f;
3155 }
3156 
3157 // FIXME: support half type
3160 inline unsigned int __shfl_xor(unsigned int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__ {
3161  __u tmp; tmp.u = var;
3162  tmp.i = __shfl_xor(tmp.i, laneMask, width);
3163  return tmp.u;
3164 }
3165 
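// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// a butterfly all-reduce built on __shfl_xor. At the full 64-lane width the
// XOR partner always stays inside the wavefront, so after log2(64) = 6 steps
// every lane holds the total. User-side kernel code, shown for exposition.
inline int wave_allreduce_sum(int value) [[hc]] {
    for (int mask = 32; mask > 0; mask /= 2)
        value += __shfl_xor(value, mask);
    return value;                         // identical in every lane
}
// (end of illustrative sketch)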
3173 inline unsigned int __mul24(unsigned int x, unsigned int y) [[hc]] {
3174  return (x & 0x00FFFFFF) * (y & 0x00FFFFFF);
3175 }
3176 
3184 inline int __mul24(int x, int y) [[hc]] {
3185  return ((x << 8) >> 8) * ((y << 8) >> 8);
3186 }
3187 
3197 inline unsigned int __mad24(unsigned int x, unsigned int y, unsigned int z) [[hc]] {
3198  return __mul24(x,y) + z;
3199 }
3200 
3210 inline int __mad24(int x, int y, int z) [[hc]] {
3211  return __mul24(x,y) + z;
3212 }
3213 
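// Illustrative note (editor's addition, not part of the original hc.hpp):
// __mul24 / __mad24 are exact only while both multiplicands fit in 24 bits,
// which makes them a natural fit for flattening small 2-D coordinates.
// User-side kernel code, shown here for exposition.
inline int flatten_2d(int row, int col, int pitch) [[hc]] {
    return __mad24(row, pitch, col);      // row * pitch + col, for row, pitch < 2^24
}
// Counter-example: __mul24(1u << 24, 2u) yields 0, not 1 << 25, because the
// upper bits of the first operand are masked off before the multiply.
// (end of illustrative sketch)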
3214 inline void abort() __HC__ {
3215  __builtin_trap();
3216 }
3217 
3218 // ------------------------------------------------------------------------
3219 // group segment
3220 // ------------------------------------------------------------------------
3221 
3229 extern "C" unsigned int get_group_segment_size() __HC__;
3230 
3236 extern "C" unsigned int get_static_group_segment_size() __HC__;
3237 
3241 extern "C" void* get_group_segment_base_pointer() __HC__;
3242 
3246 extern "C" void* get_dynamic_group_segment_base_pointer() __HC__;
3247 
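// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// addressing the dynamic group segment (LDS) from a kernel. The host side
// reserves extra group-segment bytes on the tiled_extent (this assumes the
// tiled_extent<1>::set_dynamic_group_segment_size() API), and the kernel
// reaches them through get_dynamic_group_segment_base_pointer(). User-side
// code, shown here for exposition; 'out' is assumed to have a tile-aligned
// extent.
inline void dynamic_lds_example(hc::array_view<int, 1> out) {
    hc::tiled_extent<1> ext = out.get_extent().tile(64);
    ext.set_dynamic_group_segment_size(64 * sizeof(int));
    hc::parallel_for_each(ext, [=](hc::tiled_index<1> tidx) [[hc]] {
        int* lds = static_cast<int*>(get_dynamic_group_segment_base_pointer());
        lds[tidx.local[0]] = tidx.global[0];
        tidx.barrier.wait();                        // make all LDS stores visible
        out[tidx.global] = lds[63 - tidx.local[0]]; // read a neighbouring lane's slot
    });
}
// (end of illustrative sketch)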
3248 // ------------------------------------------------------------------------
3249 // utility class for tiled_barrier
3250 // ------------------------------------------------------------------------
3251 
3252 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3253 template <typename Ker, typename Ti>
3254 void bar_wrapper(Ker *f, Ti *t)
3255 {
3256  (*f)(*t);
3257 }
3258 
3259 struct barrier_t {
3260  std::unique_ptr<ucontext_t[]> ctx;
3261  int idx;
3262  barrier_t (int a) :
3263  ctx(new ucontext_t[a + 1]) {}
3264  template <typename Ti, typename Ker>
3265  void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) {
3266  getcontext(&ctx[x]);
3267  ctx[x].uc_stack.ss_sp = stack;
3268  ctx[x].uc_stack.ss_size = S;
3269  ctx[x].uc_link = &ctx[x - 1];
3270  makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
3271  }
3272  void swap(int a, int b) {
3273  swapcontext(&ctx[a], &ctx[b]);
3274  }
3275  void wait() __HC__ {
3276  --idx;
3277  swapcontext(&ctx[idx + 1], &ctx[idx]);
3278  }
3279 };
3280 #endif
3281 
3282 
3283 // ------------------------------------------------------------------------
3284 // tiled_barrier
3285 // ------------------------------------------------------------------------
3286 
3294 class tile_barrier {
3295 public:
3296 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3297  using pb_t = std::shared_ptr<barrier_t>;
3298  tile_barrier(pb_t pb) : pbar(pb) {}
3299 
3307  tile_barrier(const tile_barrier& other) __CPU__ __HC__ : pbar(other.pbar) {}
3308 #else
3309 
3317  tile_barrier(const tile_barrier& other) __CPU__ __HC__ {}
3318 #endif
3319 
3330  void wait() const __HC__ {
3331 #if __KALMAR_ACCELERATOR__ == 1
3332  wait_with_all_memory_fence();
3333 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3334  pbar->wait();
3335 #endif
3336  }
3337 
3347  void wait_with_all_memory_fence() const __HC__ {
3348 #if __KALMAR_ACCELERATOR__ == 1
3349  amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
3350 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3351  pbar->wait();
3352 #endif
3353  }
3354 
3364  void wait_with_global_memory_fence() const __HC__ {
3365 #if __KALMAR_ACCELERATOR__ == 1
3366  amp_barrier(CLK_GLOBAL_MEM_FENCE);
3367 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3368  pbar->wait();
3369 #endif
3370  }
3371 
3382  void wait_with_tile_static_memory_fence() const __HC__ {
3383 #if __KALMAR_ACCELERATOR__ == 1
3384  amp_barrier(CLK_LOCAL_MEM_FENCE);
3385 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3386  pbar->wait();
3387 #endif
3388  }
3389 
3390 private:
3391 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3392  tile_barrier() __CPU__ __HC__ = default;
3393  pb_t pbar;
3394 #else
3395  tile_barrier() __HC__ {}
3396 #endif
3397 
3398  template <int N> friend
3399  class tiled_index;
3400 };
3401 
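// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// the canonical tile_barrier pattern: stage data in tile_static memory,
// wait(), then combine. 'per_tile_sum' is assumed to hold one element per
// tile and 'in' to have a tile-aligned extent. User-side code, shown here
// for exposition.
inline void tile_sum_example(hc::array_view<const int, 1> in,
                             hc::array_view<int, 1> per_tile_sum) {
    hc::parallel_for_each(in.get_extent().tile(64),
                          [=](hc::tiled_index<1> tidx) [[hc]] {
        tile_static int lds[64];
        lds[tidx.local[0]] = in[tidx.global];
        tidx.barrier.wait();                        // all loads staged in LDS
        for (int stride = 32; stride > 0; stride /= 2) {
            if (tidx.local[0] < stride)
                lds[tidx.local[0]] += lds[tidx.local[0] + stride];
            tidx.barrier.wait_with_tile_static_memory_fence();
        }
        if (tidx.local[0] == 0)
            per_tile_sum[tidx.tile] = lds[0];       // one partial sum per tile
    });
}
// (end of illustrative sketch)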
3402 // ------------------------------------------------------------------------
3403 // other memory fences
3404 // ------------------------------------------------------------------------
3405 
3411 // FIXME: this function has not been implemented.
3412 void all_memory_fence(const tile_barrier&) __HC__;
3413 
3419 // FIXME: this function has not been implemented.
3420 void global_memory_fence(const tile_barrier&) __HC__;
3421 
3427 // FIXME: this function has not been implemented.
3428 void tile_static_memory_fence(const tile_barrier&) __HC__;
3429 
3430 // ------------------------------------------------------------------------
3431 // tiled_index
3432 // ------------------------------------------------------------------------
3433 
3440 template <int N=3>
3441 class tiled_index {
3442 public:
3447  static const int rank = 3;
3448 
3456  tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
3457 
3463  const index<3> global;
3469  const index<3> local;
3475  const index<3> tile;
3481  const index<3> tile_origin;
3486  const tile_barrier barrier;
3491  const index<3> tile_dim;
3497  operator const index<3>() const __CPU__ __HC__ {
3498  return global;
3499  }
3500 
3501  tiled_index(const index<3>& g) __CPU__ __HC__ : global(g) {}
3502 
3503 private:
3504 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3505  __attribute__((always_inline)) tiled_index(int a0, int a1, int a2, int b0, int b1, int b2, int c0, int c1, int c2, tile_barrier& pb, int D0, int D1, int D2) __CPU__ __HC__
3506  : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0), tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1, D2) {}
3507 #endif
3508 
3509  __attribute__((annotate("__cxxamp_opencl_index")))
3510 #if __KALMAR_ACCELERATOR__ == 1
3511  __attribute__((always_inline)) tiled_index() __HC__
3512  : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
3513  local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
3514  tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
3515  tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2),
3516  amp_get_global_id(1) - amp_get_local_id(1),
3517  amp_get_global_id(0) - amp_get_local_id(0))),
3518  tile_dim(index<3>(amp_get_local_size(2), amp_get_local_size(1), amp_get_local_size(0)))
3519 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3520  __attribute__((always_inline)) tiled_index() __CPU__ __HC__
3521 #else
3522  __attribute__((always_inline)) tiled_index() __HC__
3523 #endif // __KALMAR_ACCELERATOR__
3524  {}
3525 
3526  template<typename Kernel> friend
3527  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<N>&, const Kernel&);
3528 
3529 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3530  template<typename K> friend
3531  void partitioned_task_tile_3D(K const&, tiled_extent<3> const&, int);
3532 #endif
3533 };
3534 
3535 
3541 template<>
3542 class tiled_index<1> {
3543 public:
3548  static const int rank = 1;
3549 
3557  tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
3558 
3564  const index<1> global;
3570  const index<1> local;
3576  const index<1> tile;
3582  const index<1> tile_origin;
3587  const tile_barrier barrier;
3592  const index<1> tile_dim;
3598  operator const index<1>() const __CPU__ __HC__ {
3599  return global;
3600  }
3601 
3602  tiled_index(const index<1>& g) __CPU__ __HC__ : global(g) {}
3603 
3604 private:
3605 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3606  __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb, int D0) __CPU__ __HC__
3607  : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_dim(D0) {}
3608 #endif
3609 
3610  __attribute__((annotate("__cxxamp_opencl_index")))
3611 #if __KALMAR_ACCELERATOR__ == 1
3612  __attribute__((always_inline)) tiled_index() __HC__
3613  : global(index<1>(amp_get_global_id(0))),
3614  local(index<1>(amp_get_local_id(0))),
3615  tile(index<1>(amp_get_group_id(0))),
3616  tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))),
3617  tile_dim(index<1>(amp_get_local_size(0)))
3618 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3619  __attribute__((always_inline)) tiled_index() __CPU__ __HC__
3620 #else
3621  __attribute__((always_inline)) tiled_index() __HC__
3622 #endif // __KALMAR_ACCELERATOR__
3623  {}
3624 
3625  template<typename Kernel> friend
3626  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
3627 
3628 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3629  template<typename K> friend
3630  void partitioned_task_tile_1D(K const&, tiled_extent<1> const&, int);
3631 #endif
3632 };
3633 
3639 template<>
3640 class tiled_index<2> {
3641 public:
3646  static const int rank = 2;
3647 
3655  tiled_index(const tiled_index& other) __CPU__ __HC__ : global(other.global), local(other.local), tile(other.tile), tile_origin(other.tile_origin), barrier(other.barrier), tile_dim(other.tile_dim) {}
3656 
3662  const index<2> global;
3668  const index<2> local;
3674  const index<2> tile;
3680  const index<2> tile_origin;
3685  const tile_barrier barrier;
3690  const index<2> tile_dim;
3696  operator const index<2>() const __CPU__ __HC__ {
3697  return global;
3698  }
3699 
3700  tiled_index(const index<2>& g) __CPU__ __HC__ : global(g) {}
3701 
3702 private:
3703 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3704  __attribute__((always_inline)) tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& pb, int D0, int D1) __CPU__ __HC__
3705  : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(pb), tile_dim(D0, D1) {}
3706 #endif
3707 
3708  __attribute__((annotate("__cxxamp_opencl_index")))
3709 #if __KALMAR_ACCELERATOR__ == 1
3710  __attribute__((always_inline)) tiled_index() __HC__
3711  : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
3712  local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
3713  tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
3714  tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1),
3715  amp_get_global_id(0) - amp_get_local_id(0))),
3716  tile_dim(index<2>(amp_get_local_size(1), amp_get_local_size(0)))
3717 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3718  __attribute__((always_inline)) tiled_index() __CPU__ __HC__
3719 #else
3720  __attribute__((always_inline)) tiled_index() __HC__
3721 #endif // __KALMAR_ACCELERATOR__
3722  {}
3723 
3724  template<typename Kernel> friend
3725  completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
3726 
3727 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3728  template<typename K> friend
3729  void partitioned_task_tile_2D(K const&, tiled_extent<2> const&, int);
3730 #endif
3731 };
3732 
3733 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
3734 #define SSIZE (1024 * 10)
3735 template <int N, typename Kernel, int K>
3736 struct cpu_helper
3737 {
3738  static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
3739  int i;
3740  for (i = 0; i < ext[N]; ++i) {
3741  idx[N] = i;
3742  cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
3743  }
3744  }
3745 };
3746 template <typename Kernel, int K>
3747 struct cpu_helper<K, Kernel, K>
3748 {
3749  static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) __CPU__ __HC__ {
3750  (const_cast<Kernel&>(k))(idx);
3751  }
3752 };
3753 
3754 template <typename Kernel, int N>
3755 void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
3756  index<N> idx;
3757  int start = ext[0] * part / Kalmar::NTHREAD;
3758  int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
3759  for (int i = start; i < end; i++) {
3760  idx[0] = i;
3761  cpu_helper<1, Kernel, N>::call(ker, idx, ext);
3762  }
3763 }
3764 
3765 template <typename Kernel>
3766 void partitioned_task_tile_1D(Kernel const& f, tiled_extent<1> const& ext, int part) {
3767  int D0 = ext.tile_dim[0];
3768  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
3769  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
3770  int stride = end - start;
3771  if (stride == 0)
3772  return;
3773  char *stk = new char[D0 * SSIZE];
3774  tiled_index<1> *tidx = new tiled_index<1>[D0];
3775  tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0);
3776  tile_barrier tbar(hc_bar);
3777  for (int tx = start; tx < end; tx++) {
3778  int id = 0;
3779  char *sp = stk;
3780  tiled_index<1> *tip = tidx;
3781  for (int x = 0; x < D0; x++) {
3782  new (tip) tiled_index<1>(tx * D0 + x, x, tx, tbar, D0);
3783  hc_bar->setctx(++id, sp, f, tip, SSIZE);
3784  sp += SSIZE;
3785  ++tip;
3786  }
3787  hc_bar->idx = 0;
3788  while (hc_bar->idx == 0) {
3789  hc_bar->idx = id;
3790  hc_bar->swap(0, id);
3791  }
3792  }
3793  delete [] stk;
3794  delete [] tidx;
3795 }
3796 
3797 template <typename Kernel>
3798 void partitioned_task_tile_2D(Kernel const& f, tiled_extent<2> const& ext, int part) {
3799  int D0 = ext.tile_dim[0];
3800  int D1 = ext.tile_dim[1];
3801  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
3802  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
3803  int stride = end - start;
3804  if (stride == 0)
3805  return;
3806  char *stk = new char[D1 * D0 * SSIZE];
3807  tiled_index<2> *tidx = new tiled_index<2>[D0 * D1];
3808  tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1);
3809  tile_barrier tbar(hc_bar);
3810 
3811  for (int tx = 0; tx < ext[1] / D1; tx++)
3812  for (int ty = start; ty < end; ty++) {
3813  int id = 0;
3814  char *sp = stk;
3815  tiled_index<2> *tip = tidx;
3816  for (int x = 0; x < D1; x++)
3817  for (int y = 0; y < D0; y++) {
3818  new (tip) tiled_index<2>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar, D0, D1);
3819  hc_bar->setctx(++id, sp, f, tip, SSIZE);
3820  ++tip;
3821  sp += SSIZE;
3822  }
3823  hc_bar->idx = 0;
3824  while (hc_bar->idx == 0) {
3825  hc_bar->idx = id;
3826  hc_bar->swap(0, id);
3827  }
3828  }
3829  delete [] stk;
3830  delete [] tidx;
3831 }
3832 
3833 template <typename Kernel>
3834 void partitioned_task_tile_3D(Kernel const& f, tiled_extent<3> const& ext, int part) {
3835  int D0 = ext.tile_dim[0];
3836  int D1 = ext.tile_dim[1];
3837  int D2 = ext.tile_dim[2];
3838  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
3839  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
3840  int stride = end - start;
3841  if (stride == 0)
3842  return;
3843  char *stk = new char[D2 * D1 * D0 * SSIZE];
3844  tiled_index<3> *tidx = new tiled_index<3>[D0 * D1 * D2];
3845  tile_barrier::pb_t hc_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
3846  tile_barrier tbar(hc_bar);
3847 
3848  for (int i = 0; i < ext[2] / D2; i++)
3849  for (int j = 0; j < ext[1] / D1; j++)
3850  for(int k = start; k < end; k++) {
3851  int id = 0;
3852  char *sp = stk;
3853  tiled_index<3> *tip = tidx;
3854  for (int x = 0; x < D2; x++)
3855  for (int y = 0; y < D1; y++)
3856  for (int z = 0; z < D0; z++) {
3857  new (tip) tiled_index<3>(D2 * i + x,
3858  D1 * j + y,
3859  D0 * k + z,
3860  x, y, z, i, j, k, tbar, D0, D1, D2);
3861  hc_bar->setctx(++id, sp, f, tip, SSIZE);
3862  ++tip;
3863  sp += SSIZE;
3864  }
3865  hc_bar->idx = 0;
3866  while (hc_bar->idx == 0) {
3867  hc_bar->idx = id;
3868  hc_bar->swap(0, id);
3869  }
3870  }
3871  delete [] stk;
3872  delete [] tidx;
3873 }
3874 
3875 template <typename Kernel, int N>
3876 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3877  extent<N> const& compute_domain)
3878 {
3879  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3880  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3881  obj[i] = std::thread(partitioned_task<Kernel, N>, std::cref(f), std::cref(compute_domain), i);
3882  // FIXME wrap the above operation into the completion_future object
3883  return completion_future();
3884 }
3885 
3886 template <typename Kernel>
3887 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3888  tiled_extent<1> const& compute_domain)
3889 {
3890  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3891  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3892  obj[i] = std::thread(partitioned_task_tile_1D<Kernel>,
3893  std::cref(f), std::cref(compute_domain), i);
3894  // FIXME wrap the above operation into the completion_future object
3895  return completion_future();
3896 }
3897 
3898 template <typename Kernel>
3899 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3900  tiled_extent<2> const& compute_domain)
3901 {
3902  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3903  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3904  obj[i] = std::thread(partitioned_task_tile_2D<Kernel>,
3905  std::cref(f), std::cref(compute_domain), i);
3906  // FIXME wrap the above operation into the completion_future object
3907  return completion_future();
3908 }
3909 
3910 template <typename Kernel>
3911 completion_future launch_cpu_task_async(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
3912  tiled_extent<3> const& compute_domain)
3913 {
3914  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
3915  for (int i = 0; i < Kalmar::NTHREAD; ++i)
3916  obj[i] = std::thread(partitioned_task_tile_3D<Kernel>,
3917  std::cref(f), std::cref(compute_domain), i);
3918  // FIXME wrap the above operation into the completion_future object
3919  return completion_future();
3920 }
3921 
3922 #endif
3923 
3924 // ------------------------------------------------------------------------
3925 // utility helper classes for array_view
3926 // ------------------------------------------------------------------------
3927 
3928 template <typename T, int N>
3929 struct projection_helper
3930 {
3931  // array_view<T,N>, where N>1
3932  // array_view<T,N-1> operator[](int i) const __CPU__ __HC__
3933  static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
3934  typedef array_view<T, N - 1> result_type;
3935  static result_type project(array_view<T, N>& now, int stride) __CPU__ __HC__ {
3936  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
3937  for (i = N - 1; i > 0; --i) {
3938  ext_o[i - 1] = now.extent[i];
3939  ext[i - 1] = now.extent_base[i];
3940  idx[i - 1] = now.index_base[i];
3941  }
3942  stride += now.index_base[0];
3943  extent<N - 1> ext_now(ext_o);
3944  extent<N - 1> ext_base(ext);
3945  index<N - 1> idx_base(idx);
3946  return result_type (now.cache, ext_now, ext_base, idx_base,
3947  now.offset + ext_base.size() * stride);
3948  }
3949  static result_type project(const array_view<T, N>& now, int stride) __CPU__ __HC__ {
3950  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
3951  for (i = N - 1; i > 0; --i) {
3952  ext_o[i - 1] = now.extent[i];
3953  ext[i - 1] = now.extent_base[i];
3954  idx[i - 1] = now.index_base[i];
3955  }
3956  stride += now.index_base[0];
3957  extent<N - 1> ext_now(ext_o);
3958  extent<N - 1> ext_base(ext);
3959  index<N - 1> idx_base(idx);
3960  return result_type (now.cache, ext_now, ext_base, idx_base,
3961  now.offset + ext_base.size() * stride);
3962  }
3963 };
3964 
3965 template <typename T>
3966 struct projection_helper<T, 1>
3967 {
3968  // array_view<T,1>
3969  // T& operator[](int i) const __CPU__ __HC__;
3970  typedef T& result_type;
3971  static result_type project(array_view<T, 1>& now, int i) __CPU__ __HC__ {
3972 #if __KALMAR_ACCELERATOR__ != 1
3973  now.cache.get_cpu_access(true);
3974 #endif
3975  T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
3976  return *ptr;
3977  }
3978  static result_type project(const array_view<T, 1>& now, int i) __CPU__ __HC__ {
3979 #if __KALMAR_ACCELERATOR__ != 1
3980  now.cache.get_cpu_access(true);
3981 #endif
3982  T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
3983  return *ptr;
3984  }
3985 };
3986 
3987 template <typename T, int N>
3988 struct projection_helper<const T, N>
3989 {
3990  // array_view<T,N>, where N>1
3991  // array_view<const T,N-1> operator[](int i) const __CPU__ __HC__;
3992  static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
3993  typedef array_view<const T, N - 1> const_result_type;
3994  static const_result_type project(array_view<const T, N>& now, int stride) __CPU__ __HC__ {
3995  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
3996  for (i = N - 1; i > 0; --i) {
3997  ext_o[i - 1] = now.extent[i];
3998  ext[i - 1] = now.extent_base[i];
3999  idx[i - 1] = now.index_base[i];
4000  }
4001  stride += now.index_base[0];
4002  extent<N - 1> ext_now(ext_o);
4003  extent<N - 1> ext_base(ext);
4004  index<N - 1> idx_base(idx);
4005  auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
4006  now.offset + ext_base.size() * stride);
4007  return ret;
4008  }
4009  static const_result_type project(const array_view<const T, N>& now, int stride) __CPU__ __HC__ {
4010  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
4011  for (i = N - 1; i > 0; --i) {
4012  ext_o[i - 1] = now.extent[i];
4013  ext[i - 1] = now.extent_base[i];
4014  idx[i - 1] = now.index_base[i];
4015  }
4016  stride += now.index_base[0];
4017  extent<N - 1> ext_now(ext_o);
4018  extent<N - 1> ext_base(ext);
4019  index<N - 1> idx_base(idx);
4020  auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
4021  now.offset + ext_base.size() * stride);
4022  return ret;
4023  }
4024 };
4025 
4026 template <typename T>
4027 struct projection_helper<const T, 1>
4028 {
4029  // array_view<const T,1>
4030  // const T& operator[](int i) const __CPU__ __HC__;
4031  typedef const T& const_result_type;
4032  static const_result_type project(array_view<const T, 1>& now, int i) __CPU__ __HC__ {
4033 #if __KALMAR_ACCELERATOR__ != 1
4034  now.cache.get_cpu_access();
4035 #endif
4036  const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
4037  return *ptr;
4038  }
4039  static const_result_type project(const array_view<const T, 1>& now, int i) __CPU__ __HC__ {
4040 #if __KALMAR_ACCELERATOR__ != 1
4041  now.cache.get_cpu_access();
4042 #endif
4043  const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
4044  return *ptr;
4045  }
4046 };
4047 
4048 // ------------------------------------------------------------------------
4049 // utility helper classes for array_view
4050 // ------------------------------------------------------------------------
4051 
4052 template <typename T>
4053 struct __has_data
4054 {
4055 private:
4056  struct two {char __lx; char __lxx;};
4057  template <typename C> static char test(decltype(std::declval<C>().data()));
4058  template <typename C> static two test(...);
4059 public:
4060  static const bool value = sizeof(test<T>(0)) == 1;
4061 };
4062 
4063 template <typename T>
4064 struct __has_size
4065 {
4066 private:
4067  struct two {char __lx; char __lxx;};
4068  template <typename C> static char test(decltype(&C::size));
4069  template <typename C> static two test(...);
4070 public:
4071  static const bool value = sizeof(test<T>(0)) == 1;
4072 };
4073 
4074 template <typename T>
4075 struct __is_container
4076 {
4077  using _T = typename std::remove_reference<T>::type;
4078  static const bool value = __has_size<_T>::value && __has_data<_T>::value;
4079 };
4080 
4081 
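// Illustrative note (editor's addition, not part of the original hc.hpp):
// __is_container<C> holds exactly when C exposes both data() and size(),
// which is what the container-taking array_view constructors below rely on.
// For example (assuming <vector> is available):
//
//   static_assert( __is_container<std::vector<float>>::value, "vector qualifies");
//   static_assert(!__is_container<float*>::value, "a raw pointer does not");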
4082 // ------------------------------------------------------------------------
4083 // utility helper classes for array
4084 // ------------------------------------------------------------------------
4085 
4086 template <typename T, int N>
4087 struct array_projection_helper
4088 {
4089  // array<T,N>, where N>1
4090  // array_view<T,N-1> operator[](int i0) __CPU__ __HC__;
4091  // array_view<const T,N-1> operator[](int i0) const __CPU__ __HC__;
4092  static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher");
4093  typedef array_view<T, N - 1> result_type;
4094  typedef array_view<const T, N - 1> const_result_type;
4095  static result_type project(array<T, N>& now, int stride) __CPU__ __HC__ {
4096 #if __KALMAR_ACCELERATOR__ != 1
4097  if( stride < 0)
4098  throw runtime_exception("errorMsg_throw", 0);
4099 #endif
4100  int comp[N - 1], i;
4101  for (i = N - 1; i > 0; --i)
4102  comp[i - 1] = now.extent[i];
4103  extent<N - 1> ext(comp);
4104  int offset = ext.size() * stride;
4105 #if __KALMAR_ACCELERATOR__ != 1
4106  if( offset >= now.extent.size())
4107  throw runtime_exception("errorMsg_throw", 0);
4108 #endif
4109  return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
4110  }
4111  static const_result_type project(const array<T, N>& now, int stride) __CPU__ __HC__ {
4112  int comp[N - 1], i;
4113  for (i = N - 1; i > 0; --i)
4114  comp[i - 1] = now.extent[i];
4115  extent<N - 1> ext(comp);
4116  int offset = ext.size() * stride;
4117  return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
4118  }
4119 };
4120 
4121 template <typename T>
4122 struct array_projection_helper<T, 1>
4123 {
4124  // array<T,1>
4125  // T& operator[](int i0) __CPU__ __HC__;
4126  // const T& operator[](int i0) const __CPU__ __HC__;
4127  typedef T& result_type;
4128  typedef const T& const_result_type;
4129  static result_type project(array<T, 1>& now, int i) __CPU__ __HC__ {
4130 #if __KALMAR_ACCELERATOR__ != 1
4131  now.m_device.synchronize(true);
4132 #endif
4133  T *ptr = reinterpret_cast<T *>(now.m_device.get() + i);
4134  return *ptr;
4135  }
4136  static const_result_type project(const array<T, 1>& now, int i) __CPU__ __HC__ {
4137 #if __KALMAR_ACCELERATOR__ != 1
4138  now.m_device.synchronize();
4139 #endif
4140  const T *ptr = reinterpret_cast<const T *>(now.m_device.get() + i);
4141  return *ptr;
4142  }
4143 };
4144 
4145 template <int N>
4146 const extent<N>& check(const extent<N>& ext)
4147 {
4148 #if __KALMAR_ACCELERATOR__ != 1
4149  for (int i = 0; i < N; i++)
4150  {
4151  if(ext[i] <=0)
4152  throw runtime_exception("errorMsg_throw", 0);
4153  }
4154 #endif
4155  return ext;
4156 }
4157 
4158 // ------------------------------------------------------------------------
4159 // forward declarations of copy routines used by array / array_view
4160 // ------------------------------------------------------------------------
4161 
4162 template <typename T, int N>
4163 void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
4164 
4165 template <typename T, int N>
4166 void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
4167 
4168 template <typename T, int N>
4169 void copy(const array<T, N>& src, const array_view<T, N>& dest);
4170 
4171 template <typename T, int N>
4172 void copy(const array<T, N>& src, array<T, N>& dest);
4173 
4174 template <typename T, int N>
4175 void copy(const array_view<const T, N>& src, array<T, N>& dest);
4176 
4177 template <typename T, int N>
4178 void copy(const array_view<T, N>& src, array<T, N>& dest);
4179 
4180 template <typename InputIter, typename T, int N>
4181 void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
4182 
4183 template <typename InputIter, typename T, int N>
4184 void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
4185 
4186 template <typename InputIter, typename T, int N>
4187 void copy(InputIter srcBegin, const array_view<T, N>& dest);
4188 
4189 template <typename InputIter, typename T, int N>
4190 void copy(InputIter srcBegin, array<T, N>& dest);
4191 
4192 template <typename OutputIter, typename T, int N>
4193 void copy(const array_view<T, N> &src, OutputIter destBegin);
4194 
4195 template <typename OutputIter, typename T, int N>
4196 void copy(const array<T, N> &src, OutputIter destBegin);
4197 
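// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// a typical host <-> accelerator round trip through the copy() overloads
// declared above. User-side code, shown here for exposition; assumes
// <vector> is available.
inline void copy_round_trip_example() {
    std::vector<float> host(1024, 1.0f);
    hc::array<float, 1> dev(1024);
    hc::copy(host.begin(), host.end(), dev);   // host container -> device array
    // ... launch kernels that read and write 'dev' here ...
    hc::copy(dev, host.begin());               // device array -> host container
}
// (end of illustrative sketch)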
4198 // ------------------------------------------------------------------------
4199 // array
4200 // ------------------------------------------------------------------------
4201 
4209 template <typename T, int N = 1>
4210 class array {
4211  static_assert(!std::is_const<T>::value, "array<const T> is not supported");
4212 public:
4213 #if __KALMAR_ACCELERATOR__ == 1
4214  typedef Kalmar::_data<T> acc_buffer_t;
4215 #else
4216  typedef Kalmar::_data_host<T> acc_buffer_t;
4217 #endif
4218 
4222  static const int rank = N;
4223 
4227  typedef T value_type;
4228 
4232  array() = delete;
4233 
4242  array(const array& other)
4243  : array(other.get_extent(), other.get_accelerator_view())
4244  { copy(other, *this); }
4245 
4253  array(array&& other)
4254  : m_device(other.m_device), extent(other.extent)
4255  { other.m_device.reset(); }
4256 
4264  explicit array(const extent<N>& ext)
4265  : array(ext, accelerator(L"default").get_default_view()) {}
4266 
4274  explicit array(int e0)
4275  : array(hc::extent<N>(e0)) { static_assert(N == 1, "illegal"); }
4276  explicit array(int e0, int e1)
4277  : array(hc::extent<N>(e0, e1)) {}
4278  explicit array(int e0, int e1, int e2)
4279  : array(hc::extent<N>(e0, e1, e2)) {}
4280 
4297  template <typename InputIter>
4298  array(const extent<N>& ext, InputIter srcBegin)
4299  : array(ext, srcBegin, accelerator(L"default").get_default_view()) {}
4300  template <typename InputIter>
4301  array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
4302  : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}
4303 
4316  template <typename InputIter>
4317  array(int e0, InputIter srcBegin)
4318  : array(hc::extent<N>(e0), srcBegin) {}
4319  template <typename InputIter>
4320  array(int e0, InputIter srcBegin, InputIter srcEnd)
4321  : array(hc::extent<N>(e0), srcBegin, srcEnd) {}
4322  template <typename InputIter>
4323  array(int e0, int e1, InputIter srcBegin)
4324  : array(hc::extent<N>(e0, e1), srcBegin) {}
4325  template <typename InputIter>
4326  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
4327  : array(hc::extent<N>(e0, e1), srcBegin, srcEnd) {}
4328  template <typename InputIter>
4329  array(int e0, int e1, int e2, InputIter srcBegin)
4330  : array(hc::extent<N>(e0, e1, e2), srcBegin) {}
4331  template <typename InputIter>
4332  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
4333  : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd) {}
4334 
4348  explicit array(const array_view<const T, N>& src)
4349  : array(src.get_extent(), accelerator(L"default").get_default_view())
4350  { copy(src, *this); }
4351 
4374  array(const extent<N>& ext, accelerator_view av, access_type cpu_access_type = access_type_auto)
4375 #if __KALMAR_ACCELERATOR__ == 1
4376  : m_device(ext.size()), extent(ext) {}
4377 #else
4378  : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {}
4379 #endif
4380 
4385  explicit array(int e0, void* accelerator_pointer)
4386  : array(hc::extent<N>(e0), accelerator(L"default").get_default_view(), accelerator_pointer) {}
4387  explicit array(int e0, int e1, void* accelerator_pointer)
4388  : array(hc::extent<N>(e0, e1), accelerator(L"default").get_default_view(), accelerator_pointer) {}
4389  explicit array(int e0, int e1, int e2, void* accelerator_pointer)
4390  : array(hc::extent<N>(e0, e1, e2), accelerator(L"default").get_default_view(), accelerator_pointer) {}
4391 
4392  explicit array(const extent<N>& ext, void* accelerator_pointer)
4393  : array(ext, accelerator(L"default").get_default_view(), accelerator_pointer) {}
4405  explicit array(const extent<N>& ext, accelerator_view av, void* accelerator_pointer, access_type cpu_access_type = access_type_auto)
4406 #if __KALMAR_ACCELERATOR__ == 1
4407  : m_device(ext.size(), accelerator_pointer), extent(ext) {}
4408 #else
4409  : m_device(av.pQueue, av.pQueue, check(ext).size(), accelerator_pointer, cpu_access_type), extent(ext) {}
4410 #endif
4411 
4423  array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto)
4424  : array(hc::extent<N>(e0), av, cpu_access_type) {}
4425  array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto)
4426  : array(hc::extent<N>(e0, e1), av, cpu_access_type) {}
4427  array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto)
4428  : array(hc::extent<N>(e0, e1, e2), av, cpu_access_type) {}
4429 
4459  template <typename InputIter>
4460  array(const extent<N>& ext, InputIter srcBegin, accelerator_view av,
4461  access_type cpu_access_type = access_type_auto)
4462  : array(ext, av, cpu_access_type) { copy(srcBegin, *this); }
4463  template <typename InputIter>
4464  array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
4465  accelerator_view av, access_type cpu_access_type = access_type_auto)
4466  : array(ext, av, cpu_access_type) {
4467  if (ext.size() < std::distance(srcBegin, srcEnd))
4468  throw runtime_exception("errorMsg_throw", 0);
4469  copy(srcBegin, srcEnd, *this);
4470  }
4471 
4500  array(const array_view<const T, N>& src, accelerator_view av, access_type cpu_access_type = access_type_auto)
4501  : array(src.get_extent(), av, cpu_access_type) { copy(src, *this); }
4502 
4516  template <typename InputIter>
4517  array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
4518  : array(extent<N>(e0), srcBegin, av, cpu_access_type) {}
4519  template <typename InputIter>
4520  array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
4521  : array(extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {}
4522  template <typename InputIter>
4523  array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
4524  : array(hc::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {}
4525  template <typename InputIter>
4526  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
4527  : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {}
4528  template <typename InputIter>
4529  array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
4530  : array(hc::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {}
4531  template <typename InputIter>
4532  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
4533  : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {}
4534 
4549  array(const extent<N>& ext, accelerator_view av, accelerator_view associated_av)
4550 #if __KALMAR_ACCELERATOR__ == 1
4551  : m_device(ext.size()), extent(ext) {}
4552 #else
4553  : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {}
4554 #endif
4555 
4568  array(int e0, accelerator_view av, accelerator_view associated_av)
4569  : array(hc::extent<N>(e0), av, associated_av) {}
4570  array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
4571  : array(hc::extent<N>(e0, e1), av, associated_av) {}
4572  array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
4573  : array(hc::extent<N>(e0, e1, e2), av, associated_av) {}
4574 
4592  template <typename InputIter>
4593  array(const extent<N>& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4594  : array(ext, av, associated_av) { copy(srcBegin, *this); }
4595  template <typename InputIter>
4596  array(const extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4597  : array(ext, av, associated_av) {
4598  if (ext.size() < std::distance(srcBegin, srcEnd))
4599  throw runtime_exception("errorMsg_throw", 0);
4600  copy(srcBegin, srcEnd, *this);
4601  }
4602 
4621  array(const array_view<const T, N>& src, accelerator_view av, accelerator_view associated_av)
4622  : array(src.get_extent(), av, associated_av)
4623  { copy(src, *this); }
4624 
4639  template <typename InputIter>
4640  array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4641  : array(extent<N>(e0), srcBegin, av, associated_av) {}
4642  template <typename InputIter>
4643  array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4644  : array(extent<N>(e0), srcBegin, srcEnd, av, associated_av) {}
4645  template <typename InputIter>
4646  array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4647  : array(hc::extent<N>(e0, e1), srcBegin, av, associated_av) {}
4648  template <typename InputIter>
4649  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4650  : array(hc::extent<N>(e0, e1), srcBegin, srcEnd, av, associated_av) {}
4651  template <typename InputIter>
4652  array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
4653  : array(hc::extent<N>(e0, e1, e2), srcBegin, av, associated_av) {}
4654  template <typename InputIter>
4655  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
4656  : array(hc::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {}
4657 
4663  extent<N> get_extent() const __CPU__ __HC__ { return extent; }
4664 
4669  accelerator_view get_accelerator_view() const { return m_device.get_av(); }
4670 
4675  accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); }
4676 
4680  access_type get_cpu_access_type() const { return m_device.get_access(); }
4681 
4690  array& operator=(const array& other) {
4691  if (this != &other) {
4692  array arr(other);
4693  *this = std::move(arr);
4694  }
4695  return *this;
4696  }
4697 
4705  array& operator=(array&& other) {
4706  if (this != &other) {
4707  extent = other.extent;
4708  m_device = other.m_device;
4709  other.m_device.reset();
4710  }
4711  return *this;
4712  }
4713 
4722  array& operator=(const array_view<const T, N>& src) {
4723  array arr(src);
4724  *this = std::move(arr);
4725  return *this;
4726  }
4727 
4735  void copy_to(array& dest) const {
4736 #if __KALMAR_ACCELERATOR__ != 1
4737  for(int i = 0 ; i < N ; i++)
4738  {
4739  if (dest.extent[i] < this->extent[i] )
4740  throw runtime_exception("errorMsg_throw", 0);
4741  }
4742 #endif
4743  copy(*this, dest);
4744  }
4745 
4753  void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
4754 
4760  T* data() const __CPU__ __HC__ {
4761 #if __KALMAR_ACCELERATOR__ != 1
4762  if (!m_device.get())
4763  return nullptr;
4764  m_device.synchronize(true);
4765 #endif
4766  return reinterpret_cast<T*>(m_device.get());
4767  }
4768 
4775  T* accelerator_pointer() const __CPU__ __HC__ {
4776  return reinterpret_cast<T*>(m_device.get_device_pointer());
4777  }
4778 
4786  operator std::vector<T>() const {
4787  std::vector<T> vec(extent.size());
4788  copy(*this, vec.data());
4789  return vec; // copy elision / implicit move applies; std::move is not needed here
4790  }
4791 
4803  T& operator[](const index<N>& idx) __CPU__ __HC__ {
4804 #if __KALMAR_ACCELERATOR__ != 1
4805  if (!m_device.get())
4806  throw runtime_exception("The array is not accessible on CPU.", 0);
4807  m_device.synchronize(true);
4808 #endif
4809  T *ptr = reinterpret_cast<T*>(m_device.get());
4810  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
4811  }
4812  T& operator()(const index<N>& idx) __CPU__ __HC__ {
4813  return (*this)[idx];
4814  }
4815 
4829  const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
4830 #if __KALMAR_ACCELERATOR__ != 1
4831  if (!m_device.get())
4832  throw runtime_exception("The array is not accessible on CPU.", 0);
4833  m_device.synchronize();
4834 #endif
4835  T *ptr = reinterpret_cast<T*>(m_device.get());
4836  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx, extent)];
4837  }
4838  const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
4839  return (*this)[idx];
4840  }
4841 
4852  T& operator()(int i0, int i1) __CPU__ __HC__ {
4853  return (*this)[index<2>(i0, i1)];
4854  }
4855  T& operator()(int i0, int i1, int i2) __CPU__ __HC__ {
4856  return (*this)[index<3>(i0, i1, i2)];
4857  }
4858 
4869  const T& operator()(int i0, int i1) const __CPU__ __HC__ {
4870  return (*this)[index<2>(i0, i1)];
4871  }
4872  const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
4873  return (*this)[index<3>(i0, i1, i2)];
4874  }
4875 
4893  typename array_projection_helper<T, N>::result_type
4894  operator[] (int i) __CPU__ __HC__ {
4895  return array_projection_helper<T, N>::project(*this, i);
4896  }
4897  typename array_projection_helper<T, N>::result_type
4898  operator()(int i0) __CPU__ __HC__ {
4899  return (*this)[i0];
4900  }
4901  typename array_projection_helper<T, N>::const_result_type
4902  operator[] (int i) const __CPU__ __HC__ {
4903  return array_projection_helper<T, N>::project(*this, i);
4904  }
4905  typename array_projection_helper<T, N>::const_result_type
4906  operator()(int i0) const __CPU__ __HC__ {
4907  return (*this)[i0];
4908  }
4909 
4930  array_view<T, N> section(const index<N>& origin, const extent<N>& ext) __CPU__ __HC__ {
4931 #if __KALMAR_ACCELERATOR__ != 1
4932  if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(origin, ext ,this->extent) )
4933  throw runtime_exception("errorMsg_throw", 0);
4934 #endif
4935  array_view<T, N> av(*this);
4936  return av.section(origin, ext);
4937  }
4938  array_view<const T, N> section(const index<N>& origin, const extent<N>& ext) const __CPU__ __HC__ {
4939  array_view<const T, N> av(*this);
4940  return av.section(origin, ext);
4941  }
4942 
4949  array_view<T, N> section(const index<N>& idx) __CPU__ __HC__ {
4950 #if __KALMAR_ACCELERATOR__ != 1
4951  if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, this->extent ) )
4952  throw runtime_exception("errorMsg_throw", 0);
4953 #endif
4954  array_view<T, N> av(*this);
4955  return av.section(idx);
4956  }
4957  array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
4958  array_view<const T, N> av(*this);
4959  return av.section(idx);
4960  }
4961 
4968  array_view<T,N> section(const extent<N>& ext) __CPU__ __HC__ {
4969  array_view<T, N> av(*this);
4970  return av.section(ext);
4971  }
4972  array_view<const T,N> section(const extent<N>& ext) const __CPU__ __HC__ {
4973  array_view<const T, N> av(*this);
4974  return av.section(ext);
4975  }
4976 
4989  array_view<T, 1> section(int i0, int e0) __CPU__ __HC__ {
4990  static_assert(N == 1, "Rank must be 1");
4991  return section(index<1>(i0), hc::extent<1>(e0));
4992  }
4993  array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
4994  static_assert(N == 1, "Rank must be 1");
4995  return section(index<1>(i0), hc::extent<1>(e0));
4996  }
4997  array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
4998  static_assert(N == 2, "Rank must be 2");
4999  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
5000  }
5001  array_view<T, 2> section(int i0, int i1, int e0, int e1) __CPU__ __HC__ {
5002  static_assert(N == 2, "Rank must be 2");
5003  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
5004  }
5005  array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__ {
5006  static_assert(N == 3, "Rank must be 3");
5007  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
5008  }
5009  array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
5010  static_assert(N == 3, "Rank must be 3");
5011  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
5012  }
5013 
5037  template <typename ElementType>
5038  array_view<ElementType, 1> reinterpret_as() __CPU__ __HC__ {
5039 #if __KALMAR_ACCELERATOR__ != 1
5040  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
5041  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
5042  if( (extent.size() * sizeof(T)) % sizeof(ElementType))
5043  throw runtime_exception("errorMsg_throw", 0);
5044 #endif
5045  int size = extent.size() * sizeof(T) / sizeof(ElementType);
5046  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
5047  array_view<ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
5048  return av;
5049  }
5050  template <typename ElementType>
5051  array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
5052 #if __KALMAR_ACCELERATOR__ != 1
5053  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
5054  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
5055 #endif
5056  int size = extent.size() * sizeof(T) / sizeof(ElementType);
5057  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
5058  array_view<const ElementType, 1> av(buffer_type(m_device), extent<1>(size), 0);
5059  return av;
5060  }
5061 
5077  template <int K> array_view<T, K>
5078  view_as(const extent<K>& viewExtent) __CPU__ __HC__ {
5079 #if __KALMAR_ACCELERATOR__ != 1
5080  if( viewExtent.size() > extent.size())
5081  throw runtime_exception("errorMsg_throw", 0);
5082 #endif
5083  array_view<T, K> av(m_device, viewExtent, 0);
5084  return av;
5085  }
5086  template <int K> array_view<const T, K>
5087  view_as(const extent<K>& viewExtent) const __CPU__ __HC__ {
5088 #if __KALMAR_ACCELERATOR__ != 1
5089  if( viewExtent.size() > extent.size())
5090  throw runtime_exception("errorMsg_throw", 0);
5091 #endif
5092  const array_view<T, K> av(m_device, viewExtent, 0);
5093  return av;
5094  }
5095 
5098  ~array() {}
5099 
5100  // FIXME: functions below may be considered to move to private
5101  const acc_buffer_t& internal() const __CPU__ __HC__ { return m_device; }
5102  int get_offset() const __CPU__ __HC__ { return 0; }
5103  index<N> get_index_base() const __CPU__ __HC__ { return index<N>(); }
5104 private:
5105  template <typename K, int Q> friend struct projection_helper;
5106  template <typename K, int Q> friend struct array_projection_helper;
5107  acc_buffer_t m_device;
5108  extent<N> extent;
5109 
5110  template <typename Q, int K> friend
5111  void copy(const array<Q, K>&, const array_view<Q, K>&);
5112  template <typename Q, int K> friend
5113  void copy(const array_view<const Q, K>&, array<Q, K>&);
5114 };
5115 
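// Illustrative sketch (editor's addition, not part of the original hc.hpp):
// SAXPY over two hc::array objects. Arrays must be captured by reference in
// the kernel lambda; the result is copied back with copy(). User-side code,
// shown here for exposition; assumes <vector> and a usable default
// accelerator.
inline void saxpy_example(float a, const std::vector<float>& hx,
                          std::vector<float>& hy) {
    hc::extent<1> ext(static_cast<int>(hx.size()));
    hc::array<float, 1> x(ext, hx.begin());          // copies host data in
    hc::array<float, 1> y(ext, hy.begin());
    hc::completion_future done =
        hc::parallel_for_each(ext, [&x, &y, a](hc::index<1> idx) [[hc]] {
            y[idx] = a * x[idx] + y[idx];
        });
    done.wait();
    hc::copy(y, hy.begin());                         // copy the result back out
}
// (end of illustrative sketch)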
5116 // ------------------------------------------------------------------------
5117 // array_view
5118 // ------------------------------------------------------------------------
5119 
5126 template <typename T, int N = 1>
5127 class array_view
5128 {
5129 public:
5130  typedef typename std::remove_const<T>::type nc_T;
5131 #if __KALMAR_ACCELERATOR__ == 1
5132  typedef Kalmar::_data<T> acc_buffer_t;
5133 #else
5134  typedef Kalmar::_data_host<T> acc_buffer_t;
5135 #endif
5136 
5140  static const int rank = N;
5141 
5145  typedef T value_type;
5146 
5150  array_view() = delete;
5151 
5160  array_view(array<T, N>& src) __CPU__ __HC__
5161  : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
5162 
5163  // FIXME: following interfaces were not implemented yet
5164  // template <typename Container>
5165  // explicit array_view<T, 1>::array_view(Container& src);
5166  // template <typename value_type, int Size>
5167  // explicit array_view<T, 1>::array_view(value_type (&src) [Size]) __CPU__ __HC__;
5168 
5179  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5180  array_view(const extent<N>& extent, Container& src)
5181  : array_view(extent, src.data())
5182  { static_assert( std::is_same<decltype(src.data()), T*>::value, "container element type and array view element type must match"); }
5183 
5194  array_view(const extent<N>& ext, value_type* src) __CPU__ __HC__
5195 #if __KALMAR_ACCELERATOR__ == 1
5196  : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {}
5197 #else
5198  : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {}
5199 #endif
5200 
5211  explicit array_view(const extent<N>& ext)
5212  : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {}
5213 
5224  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5225  array_view(int e0, Container& src)
5226  : array_view(hc::extent<N>(e0), src) {}
5227  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5228  array_view(int e0, int e1, Container& src)
5229  : array_view(hc::extent<N>(e0, e1), src) {}
5230  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5231  array_view(int e0, int e1, int e2, Container& src)
5232  : array_view(hc::extent<N>(e0, e1, e2), src) {}
5233 
5244  array_view(int e0, value_type *src) __CPU__ __HC__
5245  : array_view(hc::extent<N>(e0), src) {}
5246  array_view(int e0, int e1, value_type *src) __CPU__ __HC__
5247  : array_view(hc::extent<N>(e0, e1), src) {}
5248  array_view(int e0, int e1, int e2, value_type *src) __CPU__ __HC__
5249  : array_view(hc::extent<N>(e0, e1, e2), src) {}
5250 
5258  explicit array_view(int e0) : array_view(hc::extent<N>(e0)) {}
5259  explicit array_view(int e0, int e1)
5260  : array_view(hc::extent<N>(e0, e1)) {}
5261  explicit array_view(int e0, int e1, int e2)
5262  : array_view(hc::extent<N>(e0, e1, e2)) {}
5263 
5272  array_view(const array_view& other) __CPU__ __HC__
5273  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
5274 
5278  extent<N> get_extent() const __CPU__ __HC__ { return extent; }
5279 
5289  accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
5290 
5299  array_view& operator=(const array_view& other) __CPU__ __HC__ {
5300  if (this != &other) {
5301  cache = other.cache;
5302  extent = other.extent;
5303  index_base = other.index_base;
5304  extent_base = other.extent_base;
5305  offset = other.offset;
5306  }
5307  return *this;
5308  }
5309 
5317  void copy_to(array<T,N>& dest) const {
5318 #if __KALMAR_ACCELERATOR__ != 1
5319  for (int i = 0; i < N; i++)
5320  {
5321  if (dest.get_extent()[i] < this->extent[i])
5322  throw runtime_exception("errorMsg_throw", 0);
5323  }
5324 #endif
5325  copy(*this, dest);
5326  }
5327 
5335  void copy_to(const array_view& dest) const { copy(*this, dest); }
5336 
5352  T* data() const __CPU__ __HC__ {
5353 
5354 #if __KALMAR_ACCELERATOR__ != 1
5355  cache.get_cpu_access(true);
5356 #endif
5357  static_assert(N == 1, "data() is only permissible on array views of rank 1");
5358  return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
5359  }
5360 
5367  T* accelerator_pointer() const __CPU__ __HC__ {
5368  return reinterpret_cast<T*>(cache.get_device_pointer() + offset + index_base[0]);
5369  }
5370 
5376  void refresh() const { cache.refresh(); }
5377 
5411  // FIXME: type parameter is not implemented
5412  void synchronize() const { cache.get_cpu_access(); }
5413 
5424  // FIXME: type parameter is not implemented
5425  completion_future synchronize_async() const {
5426  std::future<void> fut = std::async([&]() mutable { synchronize(); });
5427  return completion_future(fut.share());
5428  }
5429 
5463  // FIXME: type parameter is not implemented
5464  void synchronize_to(const accelerator_view& av) const {
5465 #if __KALMAR_ACCELERATOR__ != 1
5466  cache.sync_to(av.pQueue);
5467 #endif
5468  }
5469 
5485  // FIXME: this method is not implemented yet
5486  completion_future synchronize_to_async(const accelerator_view& av) const;
5487 
5495  void discard_data() const {
5496 #if __KALMAR_ACCELERATOR__ != 1
5497  cache.discard();
5498 #endif
5499  }
5500 
5509  T& operator[] (const index<N>& idx) const __CPU__ __HC__ {
5510 #if __KALMAR_ACCELERATOR__ != 1
5511  cache.get_cpu_access(true);
5512 #endif
5513  T *ptr = reinterpret_cast<T*>(cache.get() + offset);
5514  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
5515  }
5516 
5517  T& operator()(const index<N>& idx) const __CPU__ __HC__ {
5518  return (*this)[idx];
5519  }
5520 
5534  // FIXME: this method is not implemented
5535  T& get_ref(const index<N>& idx) const __CPU__ __HC__;
5536 
5545  T& operator() (int i0, int i1) const __CPU__ __HC__ {
5546  static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
5547  return (*this)[index<2>(i0, i1)];
5548  }
5549  T& operator() (int i0, int i1, int i2) const __CPU__ __HC__ {
5550  static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
5551  return (*this)[index<3>(i0, i1, i2)];
5552  }
5553 
5575  typename projection_helper<T, N>::result_type
5576  operator[] (int i) const __CPU__ __HC__ {
5577  return projection_helper<T, N>::project(*this, i);
5578  }
5579  typename projection_helper<T, N>::result_type
5580  operator() (int i0) const __CPU__ __HC__ { return (*this)[i0]; }
5581 
5602  array_view<T, N> section(const index<N>& idx,
5603  const extent<N>& ext) const __CPU__ __HC__ {
5604 #if __KALMAR_ACCELERATOR__ != 1
5605  if ( !Kalmar::amp_helper<N, index<N>, hc::extent<N>>::contains(idx, ext,this->extent ) )
5606  throw runtime_exception("errorMsg_throw", 0);
5607 #endif
5608  array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset);
5609  return av;
5610  }
5611 
5615  array_view<T, N> section(const index<N>& idx) const __CPU__ __HC__ {
5616  hc::extent<N> ext(extent);
5617  Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
5618  return section(idx, ext);
5619  }
5620 
5624  array_view<T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
5625  index<N> idx;
5626  return section(idx, ext);
5627  }
5628 
5639  array_view<T, 1> section(int i0, int e0) const __CPU__ __HC__ {
5640  static_assert(N == 1, "Rank must be 1");
5641  return section(index<1>(i0), hc::extent<1>(e0));
5642  }
5643 
5644  array_view<T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
5645  static_assert(N == 2, "Rank must be 2");
5646  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
5647  }
5648 
5649  array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
5650  static_assert(N == 3, "Rank must be 3");
5651  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
5652  }
5653 
5667  template <typename ElementType>
5668  array_view<ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
5669  static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
5670 #if __KALMAR_ACCELERATOR__ != 1
5671  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
5672  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
5673  if ( (extent.size() * sizeof(T)) % sizeof(ElementType))
5674  throw runtime_exception("errorMsg_throw", 0);
5675 #endif
5676  int size = extent.size() * sizeof(T) / sizeof(ElementType);
5677  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
5678  array_view<ElementType, 1> av(buffer_type(cache),
5679  extent<1>(size),
5680  (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
5681  return av;
5682  }
5683 
5692  template <int K>
5693  array_view<T, K> view_as(extent<K> viewExtent) const __CPU__ __HC__ {
5694  static_assert(N == 1, "view_as is only permissible on array views of rank 1");
5695 #if __KALMAR_ACCELERATOR__ != 1
5696  if ( viewExtent.size() > extent.size())
5697  throw runtime_exception("errorMsg_throw", 0);
5698 #endif
5699  array_view<T, K> av(cache, viewExtent, offset + index_base[0]);
5700  return av;
5701  }
5702 
5703  ~array_view() __CPU__ __HC__ {}
5704 
5705  // FIXME: the following functions could be considered to move to private
5706  const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
5707 
5708  int get_offset() const __CPU__ __HC__ { return offset; }
5709 
5710  index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
5711 
5712 private:
5713  template <typename K, int Q> friend struct projection_helper;
5714  template <typename K, int Q> friend struct array_projection_helper;
5715  template <typename Q, int K> friend class array;
5716  template <typename Q, int K> friend class array_view;
5717 
5718  template<typename Q, int K> friend
5719  bool is_flat(const array_view<Q, K>&) noexcept;
5720  template <typename Q, int K> friend
5721  void copy(const array<Q, K>&, const array_view<Q, K>&);
5722  template <typename InputIter, typename Q, int K> friend
5723  void copy(InputIter, InputIter, const array_view<Q, K>&);
5724  template <typename Q, int K> friend
5725  void copy(const array_view<const Q, K>&, array<Q, K>&);
5726  template <typename OutputIter, typename Q, int K> friend
5727  void copy(const array_view<Q, K>&, OutputIter);
5728  template <typename Q, int K> friend
5729  void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
5730 
5731  // used by view_as and reinterpret_as
5732  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
5733  int offset) __CPU__ __HC__
5734  : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
5735 
5736  // used by section and projection
5737  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
5738  const hc::extent<N>& ext_b,
5739  const index<N>& idx_b, int off) __CPU__ __HC__
5740  : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
5741  offset(off) {}
5742 
5743  acc_buffer_t cache;
5744  hc::extent<N> extent;
5745  hc::extent<N> extent_base;
5746  index<N> index_base;
5747  int offset;
5748 };
5749 
5750 // ------------------------------------------------------------------------
5751 // array_view (read-only)
5752 // ------------------------------------------------------------------------
5753 
5761 template <typename T, int N>
5762 class array_view<const T, N>
5763 {
5764 public:
5765  typedef typename std::remove_const<T>::type nc_T;
5766 
5767 #if __KALMAR_ACCELERATOR__ == 1
5768  typedef Kalmar::_data<nc_T> acc_buffer_t;
5769 #else
5770  typedef Kalmar::_data_host<const T> acc_buffer_t;
5771 #endif
5772 
5776  static const int rank = N;
5777 
5781  typedef const T value_type;
5782 
5786  array_view() = delete;
5787 
5796  array_view(const array<T,N>& src) __CPU__ __HC__
5797  : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
5798 
5799  // FIXME: following interfaces were not implemented yet
5800  // template <typename Container>
5801  // explicit array_view<const T, 1>::array_view(const Container& src);
5802  // template <typename value_type, int Size>
5803  // explicit array_view<const T, 1>::array_view(const value_type (&src) [Size]) __CPU__ __HC__;
5804 
5815  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5816  array_view(const extent<N>& extent, const Container& src)
5817  : array_view(extent, src.data())
5818  { static_assert( std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value, "container element type and array view element type must match"); }
5819 
5830  array_view(const extent<N>& ext, const value_type* src) __CPU__ __HC__
5831 #if __KALMAR_ACCELERATOR__ == 1
5832  : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
5833 #else
5834  : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
5835 #endif
5836 
5847  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5848  array_view(int e0, Container& src) : array_view(hc::extent<1>(e0), src) {}
5849  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5850  array_view(int e0, int e1, Container& src)
5851  : array_view(hc::extent<N>(e0, e1), src) {}
5852  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
5853  array_view(int e0, int e1, int e2, Container& src)
5854  : array_view(hc::extent<N>(e0, e1, e2), src) {}
5855 
5866  array_view(int e0, const value_type *src) __CPU__ __HC__
5867  : array_view(hc::extent<1>(e0), src) {}
5868  array_view(int e0, int e1, const value_type *src) __CPU__ __HC__
5869  : array_view(hc::extent<2>(e0, e1), src) {}
5870  array_view(int e0, int e1, int e2, const value_type *src) __CPU__ __HC__
5871  : array_view(hc::extent<3>(e0, e1, e2), src) {}
5872 
5881  array_view(const array_view<nc_T, N>& other) __CPU__ __HC__
5882  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
5883 
5891  array_view(const array_view& other) __CPU__ __HC__
5892  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
5893 
5897  extent<N> get_extent() const __CPU__ __HC__ { return extent; }
5898 
5908  accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
5909 
5919  array_view& operator=(const array_view<T,N>& other) __CPU__ __HC__ {
5920  cache = other.cache;
5921  extent = other.extent;
5922  index_base = other.index_base;
5923  extent_base = other.extent_base;
5924  offset = other.offset;
5925  return *this;
5926  }
5927 
5928  array_view& operator=(const array_view& other) __CPU__ __HC__ {
5929  if (this != &other) {
5930  cache = other.cache;
5931  extent = other.extent;
5932  index_base = other.index_base;
5933  extent_base = other.extent_base;
5934  offset = other.offset;
5935  }
5936  return *this;
5937  }
5938 
5948  void copy_to(array<T,N>& dest) const { copy(*this, dest); }
5949 
5957  void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
5958 
5974  const T* data() const __CPU__ __HC__ {
5975 #if __KALMAR_ACCELERATOR__ != 1
5976  cache.get_cpu_access();
5977 #endif
5978  static_assert(N == 1, "data() is only permissible on array views of rank 1");
5979  return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
5980  }
5981 
5988  const T* accelerator_pointer() const __CPU__ __HC__ {
5989  return reinterpret_cast<const T*>(cache.get_device_pointer() + offset + index_base[0]);
5990  }
5991 
5997  void refresh() const { cache.refresh(); }
5998 
6028  void synchronize() const { cache.get_cpu_access(); }
6029 
6040  completion_future synchronize_async() const {
6041  std::future<void> fut = std::async([&]() mutable { synchronize(); });
6042  return completion_future(fut.share());
6043  }
6044 
6056  void synchronize_to(const accelerator_view& av) const {
6057 #if __KALMAR_ACCELERATOR__ != 1
6058  cache.sync_to(av.pQueue);
6059 #endif
6060  }
6061 
6077  // FIXME: this method is not implemented yet
6078  completion_future synchronize_to_async(const accelerator_view& av) const;
6079 
6088  const T& operator[](const index<N>& idx) const __CPU__ __HC__ {
6089 #if __KALMAR_ACCELERATOR__ != 1
6090  cache.get_cpu_access();
6091 #endif
6092  const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
6093  return ptr[Kalmar::amp_helper<N, index<N>, hc::extent<N>>::flatten(idx + index_base, extent_base)];
6094  }
6095  const T& operator()(const index<N>& idx) const __CPU__ __HC__ {
6096  return (*this)[idx];
6097  }
6098 
6112  // FIXME: this method is not implemented
6113  const T& get_ref(const index<N>& idx) const __CPU__ __HC__;
6114 
6123  const T& operator()(int i0) const __CPU__ __HC__ {
6124  static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
6125  return (*this)[index<1>(i0)];
6126  }
6127 
6128  const T& operator()(int i0, int i1) const __CPU__ __HC__ {
6129  static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
6130  return (*this)[index<2>(i0, i1)];
6131  }
6132  const T& operator()(int i0, int i1, int i2) const __CPU__ __HC__ {
6133  static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
6134  return (*this)[index<3>(i0, i1, i2)];
6135  }
6136 
6158  typename projection_helper<const T, N>::const_result_type
6159  operator[] (int i) const __CPU__ __HC__ {
6160  return projection_helper<const T, N>::project(*this, i);
6161  }
6162 
6163  // FIXME: typename projection_helper<const T, N>::const_result_type
6164  // operator() (int i0) const __CPU__ __HC__
6165  // is not implemented
6166 
6187  array_view<const T, N> section(const index<N>& idx,
6188  const extent<N>& ext) const __CPU__ __HC__ {
6189  array_view<const T, N> av(cache, ext, extent_base, idx + index_base, offset);
6190  return av;
6191  }
6192 
6196  array_view<const T, N> section(const index<N>& idx) const __CPU__ __HC__ {
6197  hc::extent<N> ext(extent);
6198  Kalmar::amp_helper<N, index<N>, hc::extent<N>>::minus(idx, ext);
6199  return section(idx, ext);
6200  }
6201 
6205  array_view<const T, N> section(const extent<N>& ext) const __CPU__ __HC__ {
6206  index<N> idx;
6207  return section(idx, ext);
6208  }
6209 
6220  array_view<const T, 1> section(int i0, int e0) const __CPU__ __HC__ {
6221  static_assert(N == 1, "Rank must be 1");
6222  return section(index<1>(i0), hc::extent<1>(e0));
6223  }
6224 
6225  array_view<const T, 2> section(int i0, int i1, int e0, int e1) const __CPU__ __HC__ {
6226  static_assert(N == 2, "Rank must be 2");
6227  return section(index<2>(i0, i1), hc::extent<2>(e0, e1));
6228  }
6229 
6230  array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__ {
6231  static_assert(N == 3, "Rank must be 3");
6232  return section(index<3>(i0, i1, i2), hc::extent<3>(e0, e1, e2));
6233  }
6234 
6248  template <typename ElementType>
6249  array_view<const ElementType, 1> reinterpret_as() const __CPU__ __HC__ {
6250  static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
6251 #if __KALMAR_ACCELERATOR__ != 1
6252  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
6253  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
6254 #endif
6255  int size = extent.size() * sizeof(T) / sizeof(ElementType);
6256  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
6257  array_view<const ElementType, 1> av(buffer_type(cache),
6258  extent<1>(size),
6259  (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
6260  return av;
6261  }
6262 
6271  template <int K>
6272  array_view<const T, K> view_as(extent<K> viewExtent) const __CPU__ __HC__ {
6273  static_assert(N == 1, "view_as is only permissible on array views of rank 1");
6274 #if __KALMAR_ACCELERATOR__ != 1
6275  if ( viewExtent.size() > extent.size())
6276  throw runtime_exception("errorMsg_throw", 0);
6277 #endif
6278  array_view<const T, K> av(cache, viewExtent, offset + index_base[0]);
6279  return av;
6280  }
6281 
6282  ~array_view() __CPU__ __HC__ {}
6283 
6284  // FIXME: the following functions could be moved to the private section
6285  const acc_buffer_t& internal() const __CPU__ __HC__ { return cache; }
6286 
6287  int get_offset() const __CPU__ __HC__ { return offset; }
6288 
6289  index<N> get_index_base() const __CPU__ __HC__ { return index_base; }
6290 
6291 private:
6292  template <typename K, int Q> friend struct projection_helper;
6293  template <typename K, int Q> friend struct array_projection_helper;
6294  template <typename Q, int K> friend class array;
6295  template <typename Q, int K> friend class array_view;
6296 
6297  template<typename Q, int K> friend
6298  bool is_flat(const array_view<Q, K>&) noexcept;
6299  template <typename Q, int K> friend
6300  void copy(const array<Q, K>&, const array_view<Q, K>&);
6301  template <typename InputIter, typename Q, int K> friend
6302  void copy(InputIter, InputIter, const array_view<Q, K>&);
6303  template <typename Q, int K> friend
6304  void copy(const array_view<const Q, K>&, array<Q, K>&);
6305  template <typename OutputIter, typename Q, int K> friend
6306  void copy(const array_view<Q, K>&, OutputIter);
6307  template <typename Q, int K> friend
6308  void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
6309 
6310  // used by view_as and reinterpret_as
6311  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext,
6312  int offset) __CPU__ __HC__
6313  : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
6314 
6315  // used by section and projection
6316  array_view(const acc_buffer_t& cache, const hc::extent<N>& ext_now,
6317  const extent<N>& ext_b,
6318  const index<N>& idx_b, int off) __CPU__ __HC__
6319  : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b),
6320  offset(off) {}
6321 
6322  acc_buffer_t cache;
6323  hc::extent<N> extent;
6324  hc::extent<N> extent_base;
6325  index<N> index_base;
6326  int offset;
6327 };
6328 
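// Example (illustrative sketch, not part of the original header): a read-only
// view over host data. The name "src" is a placeholder host container.
//
//   std::vector<int> src(100, 7);
//   hc::array_view<const int, 1> cav(100, src);  // elements cannot be modified through cav
//   int first = cav(0);                          // read access on the host side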
6329 // ------------------------------------------------------------------------
6330 // utility functions for copy
6331 // ------------------------------------------------------------------------
6332 
6333 template<typename T, int N>
6334 static inline bool is_flat(const array_view<T, N>& av) noexcept {
6335  return av.extent == av.extent_base && av.index_base == index<N>();
6336 }
6337 
6338 template<typename T>
6339 static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
6340 
6341 template <typename InputIter, typename T, int N, int dim>
6342 struct copy_input
6343 {
6344  void operator()(InputIter& It, T* ptr, const extent<N>& ext,
6345  const extent<N>& base, const index<N>& idx)
6346  {
6347  size_t stride = 1;
6348  for (int i = dim; i < N; i++)
6349  stride *= base[i];
6350  ptr += stride * idx[dim - 1];
6351  for (int i = 0; i < ext[dim - 1]; i++) {
6352  copy_input<InputIter, T, N, dim + 1>()(It, ptr, ext, base, idx);
6353  ptr += stride;
6354  }
6355  }
6356 };
6357 
6358 template <typename InputIter, typename T, int N>
6359 struct copy_input<InputIter, T, N, N>
6360 {
6361  void operator()(InputIter& It, T* ptr, const extent<N>& ext,
6362  const extent<N>& base, const index<N>& idx)
6363  {
6364  InputIter end = It;
6365  std::advance(end, ext[N - 1]);
6366  std::copy(It, end, ptr + idx[N - 1]);
6367  It = end;
6368  }
6369 };
6370 
6371 template <typename OutputIter, typename T, int N, int dim>
6372 struct copy_output
6373 {
6374  void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
6375  const extent<N>& base, const index<N>& idx)
6376  {
6377  size_t stride = 1;
6378  for (int i = dim; i < N; i++)
6379  stride *= base[i];
6380  ptr += stride * idx[dim - 1];
6381  for (int i = 0; i < ext[dim - 1]; i++) {
6382  copy_output<OutputIter, T, N, dim + 1>()(ptr, It, ext, base, idx);
6383  ptr += stride;
6384  }
6385  }
6386 };
6387 
6388 template <typename OutputIter, typename T, int N>
6389 struct copy_output<OutputIter, T, N, N>
6390 {
6391  void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
6392  const extent<N>& base, const index<N>& idx)
6393  {
6394  ptr += idx[N - 1];
6395  It = std::copy(ptr, ptr + ext[N - 1], It);
6396  }
6397 };
6398 
6399 template <typename T, int N, int dim>
6400 struct copy_bidir
6401 {
6402  void operator()(const T* src, T* dst, const extent<N>& ext,
6403  const extent<N>& base1, const index<N>& idx1,
6404  const extent<N>& base2, const index<N>& idx2)
6405  {
6406  size_t stride1 = 1;
6407  for (int i = dim; i < N; i++)
6408  stride1 *= base1[i];
6409  src += stride1 * idx1[dim - 1];
6410 
6411  size_t stride2 = 1;
6412  for (int i = dim; i < N; i++)
6413  stride2 *= base2[i];
6414  dst += stride2 * idx2[dim - 1];
6415 
6416  for (int i = 0; i < ext[dim - 1]; i++) {
6417  copy_bidir<T, N, dim + 1>()(src, dst, ext, base1, idx1, base2, idx2);
6418  src += stride1;
6419  dst += stride2;
6420  }
6421  }
6422 };
6423 
6424 template <typename T, int N>
6425 struct copy_bidir<T, N, N>
6426 {
6427  void operator()(const T* src, T* dst, const extent<N>& ext,
6428  const extent<N>& base1, const index<N>& idx1,
6429  const extent<N>& base2, const index<N>& idx2)
6430  {
6431  src += idx1[N - 1];
6432  dst += idx2[N - 1];
6433  std::copy(src, src + ext[N - 1], dst);
6434  }
6435 };
6436 
6437 template <typename Iter, typename T, int N>
6438 struct do_copy
6439 {
6440  template<template <typename, int> class _amp_container>
6441  void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
6442  size_t size = dest.get_extent().size();
6443  size_t offset = dest.get_offset();
6444  bool modify = true;
6445 
6446  T* ptr = dest.internal().map_ptr(modify, size, offset);
6447  std::copy(srcBegin, srcEnd, ptr);
6448  dest.internal().unmap_ptr(ptr, modify, size, offset);
6449  }
6450  template<template <typename, int> class _amp_container>
6451  void operator()(const _amp_container<T, N> &src, Iter destBegin) {
6452  size_t size = src.get_extent().size();
6453  size_t offset = src.get_offset();
6454  bool modify = false;
6455 
6456  const T* ptr = src.internal().map_ptr(modify, size, offset);
6457  std::copy(ptr, ptr + src.get_extent().size(), destBegin);
6458  src.internal().unmap_ptr(ptr, modify, size, offset);
6459  }
6460 };
6461 
6462 template <typename Iter, typename T>
6463 struct do_copy<Iter, T, 1>
6464 {
6465  template<template <typename, int> class _amp_container>
6466  void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
6467  size_t size = dest.get_extent().size();
6468  size_t offset = dest.get_offset() + dest.get_index_base()[0];
6469  bool modify = true;
6470 
6471  T* ptr = dest.internal().map_ptr(modify, size, offset);
6472  std::copy(srcBegin, srcEnd, ptr);
6473  dest.internal().unmap_ptr(ptr, modify, size, offset);
6474  }
6475  template<template <typename, int> class _amp_container>
6476  void operator()(const _amp_container<T, 1> &src, Iter destBegin) {
6477  size_t size = src.get_extent().size();
6478  size_t offset = src.get_offset() + src.get_index_base()[0];
6479  bool modify = false;
6480 
6481  const T* ptr = src.internal().map_ptr(modify, size, offset);
6482  std::copy(ptr, ptr + src.get_extent().size(), destBegin);
6483  src.internal().unmap_ptr(ptr, modify, size, offset);
6484  }
6485 };
6486 
6487 template <typename T, int N>
6488 struct do_copy<T*, T, N>
6489 {
6490  template<template <typename, int> class _amp_container>
6491  void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
6492  dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
6493  }
6494  template<template <typename, int> class _amp_container>
6495  void operator()(const _amp_container<T, N> &src, T* destBegin) {
6496  src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
6497  }
6498 };
6499 
6500 template <typename T>
6501 struct do_copy<T*, T, 1>
6502 {
6503  template<template <typename, int> class _amp_container>
6504  void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
6505  dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
6506  dest.get_offset() + dest.get_index_base()[0], true);
6507  }
6508  template<template <typename, int> class _amp_container>
6509  void operator()(const _amp_container<T, 1> &src, T* destBegin) {
6510  src.internal().read(destBegin, src.get_extent().size(),
6511  src.get_offset() + src.get_index_base()[0]);
6512  }
6513 };
6514 
6515 // ------------------------------------------------------------------------
6516 // copy
6517 // ------------------------------------------------------------------------
6518 
6527 template <typename T, int N>
6528 void copy(const array<T, N>& src, array<T, N>& dest) {
6529  src.internal().copy(dest.internal(), 0, 0, 0);
6530 }
6531 
6540 template <typename T, int N>
6541 void copy(const array<T, N>& src, const array_view<T, N>& dest) {
6542  if (is_flat(dest))
6543  src.internal().copy(dest.internal(), src.get_offset(),
6544  dest.get_offset(), dest.get_extent().size());
6545  else {
6546  // FIXME: logic here deserves to be reviewed
6547  size_t srcSize = src.extent.size();
6548  size_t srcOffset = 0;
6549  bool srcModify = false;
6550  size_t destSize = dest.extent_base.size();
6551  size_t destOffset = dest.offset;
6552  bool destModify = true;
6553 
6554  T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6555  T* p = pSrc;
6556  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6557  copy_input<T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
6558  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
6559  src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
6560  }
6561 }
6562 
6563 template <typename T>
6564 void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
6565  src.internal().copy(dest.internal(),
6566  src.get_offset() + src.get_index_base()[0],
6567  dest.get_offset() + dest.get_index_base()[0],
6568  dest.get_extent().size());
6569 }
6570 
6582 template <typename T, int N>
6583 void copy(const array_view<const T, N>& src, array<T, N>& dest) {
6584  if (is_flat(src)) {
6585  src.internal().copy(dest.internal(), src.get_offset(),
6586  dest.get_offset(), dest.get_extent().size());
6587  } else {
6588  // FIXME: logic here deserves to be reviewed
6589  size_t srcSize = src.extent_base.size();
6590  size_t srcOffset = src.offset;
6591  bool srcModify = false;
6592  size_t destSize = dest.extent.size();
6593  size_t destOffset = 0;
6594  bool destModify = true;
6595 
6596  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6597  T* p = pDst;
6598  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6599  copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
6600  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
6601  dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
6602  }
6603 }
6604 
6605 template <typename T, int N>
6606 void copy(const array_view<T, N>& src, array<T, N>& dest) {
6607  const array_view<const T, N> buf(src);
6608  copy(buf, dest);
6609 }
6610 
6611 template <typename T>
6612 void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
6613  src.internal().copy(dest.internal(),
6614  src.get_offset() + src.get_index_base()[0],
6615  dest.get_offset() + dest.get_index_base()[0],
6616  dest.get_extent().size());
6617 }
6618 
6630 template <typename T, int N>
6631 void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
6632  if (is_flat(src)) {
6633  if (is_flat(dest))
6634  src.internal().copy(dest.internal(), src.get_offset(),
6635  dest.get_offset(), dest.get_extent().size());
6636  else {
6637  // FIXME: logic here deserves to be reviewed
6638  size_t srcSize = src.extent.size();
6639  size_t srcOffset = 0;
6640  bool srcModify = false;
6641  size_t destSize = dest.extent_base.size();
6642  size_t destOffset = dest.offset;
6643  bool destModify = true;
6644 
6645  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6646  const T* p = pSrc;
6647  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6648  copy_input<const T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
6649  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
6650  src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
6651  }
6652  } else {
6653  if (is_flat(dest)) {
6654  // FIXME: logic here deserves to be reviewed
6655  size_t srcSize = src.extent_base.size();
6656  size_t srcOffset = src.offset;
6657  bool srcModify = false;
6658  size_t destSize = dest.extent.size();
6659  size_t destOffset = 0;
6660  bool destModify = true;
6661 
6662  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6663  T* p = pDst;
6664  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6665  copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
6666  dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
6667  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
6668  } else {
6669  // FIXME: logic here deserves to be reviewed
6670  size_t srcSize = src.extent_base.size();
6671  size_t srcOffset = src.offset;
6672  bool srcModify = false;
6673  size_t destSize = dest.extent_base.size();
6674  size_t destOffset = dest.offset;
6675  bool destModify = true;
6676 
6677  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
6678  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
6679  copy_bidir<T, N, 1>()(pSrc, pDst, src.extent, src.extent_base,
6680  src.index_base, dest.extent_base, dest.index_base);
6681  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
6682  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
6683  }
6684  }
6685 }
6686 
6687 template <typename T, int N>
6688 void copy(const array_view<T, N>& src, const array_view<T, N>& dest) {
6689  const array_view<const T, N> buf(src);
6690  copy(buf, dest);
6691 }
6692 
6693 template <typename T>
6694 void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
6695  src.internal().copy(dest.internal(),
6696  src.get_offset() + src.get_index_base()[0],
6697  dest.get_offset() + dest.get_index_base()[0],
6698  dest.get_extent().size());
6699 }
6700 
6717 template <typename InputIter, typename T, int N>
6718 void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
6719 #if __KALMAR_ACCELERATOR__ != 1
6720  if ((std::distance(srcBegin, srcEnd) <= 0) || (std::distance(srcBegin, srcEnd) < dest.get_extent().size()))
6721  throw runtime_exception("errorMsg_throw ,copy between different types", 0);
6722 #endif
6723  do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
6724 }
6725 
6726 template <typename InputIter, typename T, int N>
6727 void copy(InputIter srcBegin, array<T, N>& dest) {
6728  InputIter srcEnd = srcBegin;
6729  std::advance(srcEnd, dest.get_extent().size());
6730  copy(srcBegin, srcEnd, dest);
6731 }
6732 
6749 template <typename InputIter, typename T, int N>
6750 void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
6751  if (is_flat(dest))
6752  do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
6753  else {
6754  size_t size = dest.extent_base.size();
6755  size_t offset = dest.offset;
6756  bool modify = true;
6757 
6758  T* ptr = dest.internal().map_ptr(modify, size, offset);
6759  copy_input<InputIter, T, N, 1>()(srcBegin, ptr, dest.extent, dest.extent_base, dest.index_base);
6760  dest.internal().unmap_ptr(ptr, modify, size, offset);
6761  }
6762 }
6763 
6764 template <typename InputIter, typename T, int N>
6765 void copy(InputIter srcBegin, const array_view<T, N>& dest) {
6766  InputIter srcEnd = srcBegin;
6767  std::advance(srcEnd, dest.get_extent().size());
6768  copy(srcBegin, srcEnd, dest);
6769 }
6770 
6783 template <typename OutputIter, typename T, int N>
6784 void copy(const array<T, N> &src, OutputIter destBegin) {
6785  do_copy<OutputIter, T, N>()(src, destBegin);
6786 }
6787 
6798 template <typename OutputIter, typename T, int N>
6799 void copy(const array_view<T, N> &src, OutputIter destBegin) {
6800  if (is_flat(src))
6801  do_copy<OutputIter, T, N>()(src, destBegin);
6802  else {
6803  size_t size = src.extent_base.size();
6804  size_t offset = src.offset;
6805  bool modify = false;
6806 
6807  T* ptr = src.internal().map_ptr(modify, size, offset);
6808  copy_output<OutputIter, T, N, 1>()(ptr, destBegin, src.extent, src.extent_base, src.index_base);
6809  src.internal().unmap_ptr(ptr, modify, size, offset);
6810  }
6811 }
6812 
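// Example (illustrative sketch, not part of the original header): synchronous
// host <-> device copies. "host_in" and "host_out" are placeholder host containers.
//
//   std::vector<float> host_in(256, 1.0f), host_out(256, 0.0f);
//   hc::array<float, 1> dev(256);
//   hc::copy(host_in.begin(), host_in.end(), dev);  // iterator range -> array
//   hc::copy(dev, host_out.begin());                // array -> output iterator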
6813 // ------------------------------------------------------------------------
6814 // utility function for copy_async
6815 // ------------------------------------------------------------------------
6816 
6817 
6818 // ------------------------------------------------------------------------
6819 // copy_async
6820 // ------------------------------------------------------------------------
6821 
6830 template <typename T, int N>
6831 completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
6832  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6833  return completion_future(fut.share());
6834 }
6835 
6843 template <typename T, int N>
6844 completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
6845  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6846  return completion_future(fut.share());
6847 }
6848 
6858 template <typename T, int N>
6859 completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
6860  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6861  return completion_future(fut.share());
6862 }
6863 
6864 template <typename T, int N>
6865 completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
6866  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6867  return completion_future(fut.share());
6868 }
6869 
6881 template <typename T, int N>
6882 completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
6883  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6884  return completion_future(fut.share());
6885 }
6886 
6887 template <typename T, int N>
6888 completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
6889  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6890  return completion_future(fut.share());
6891 }
6892 
6909 template <typename InputIter, typename T, int N>
6910 completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
6911  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
6912  return completion_future(fut.share());
6913 }
6914 
6915 template <typename InputIter, typename T, int N>
6916 completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
6917  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
6918  return completion_future(fut.share());
6919 }
6920 
6937 template <typename InputIter, typename T, int N>
6938 completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
6939  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
6940  return completion_future(fut.share());
6941 }
6942 
6943 template <typename InputIter, typename T, int N>
6944 completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
6945  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
6946  return completion_future(fut.share());
6947 }
6948 
6961 template <typename OutputIter, typename T, int N>
6962 completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
6963  std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
6964  return completion_future(fut.share());
6965 }
6966 
6977 template <typename OutputIter, typename T, int N>
6978 completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
6979  std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
6980  return completion_future(fut.share());
6981 }
6982 
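// Example (illustrative sketch, not part of the original header): overlapping a
// host-to-device copy with other host work via copy_async. "host" is a
// placeholder host container.
//
//   std::vector<int> host(1024, 42);
//   hc::array<int, 1> dev(1024);
//   hc::completion_future cf = hc::copy_async(host.begin(), host.end(), dev);
//   // ... unrelated host work can proceed here ...
//   cf.wait();  // block until the asynchronous copy has completed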
6983 
6984 // FIXME: consider remove these functions
6985 template <typename T, int N>
6986 completion_future copy_async(const array<T, N>& src, const array<T, N>& dest) {
6987  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6988  return completion_future(fut.share());
6989 }
6990 
6991 template <typename T, int N>
6992 completion_future copy_async(const array_view<const T, N>& src, const array<T, N>& dest) {
6993  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
6994  return completion_future(fut.share());
6995 }
6996 
6997 template <typename T, int N>
6998 completion_future copy_async(const array_view<T, N>& src, const array<T, N>& dest) {
6999  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
7000  return completion_future(fut.share());
7001 }
7002 
7003 // ------------------------------------------------------------------------
7004 // atomic functions
7005 // ------------------------------------------------------------------------
7006 
7022 #if __KALMAR_ACCELERATOR__ == 1
7023 extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) __HC__;
7024 extern "C" int atomic_exchange_int(int *p, int val) __HC__;
7025 extern "C" float atomic_exchange_float(float *p, float val) __HC__;
7026 extern "C" uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val) __HC__;
7027 
7028 static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) __CPU__ __HC__ {
7029  return atomic_exchange_unsigned(dest, val);
7030 }
7031 static inline int atomic_exchange(int * dest, int val) __CPU__ __HC__ {
7032  return atomic_exchange_int(dest, val);
7033 }
7034 static inline float atomic_exchange(float * dest, float val) __CPU__ __HC__ {
7035  return atomic_exchange_float(dest, val);
7036 }
7037 static inline uint64_t atomic_exchange(uint64_t * dest, uint64_t val) __CPU__ __HC__ {
7038  return atomic_exchange_uint64(dest, val);
7039 }
7040 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7041 unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
7042 int atomic_exchange_int(int *p, int val);
7043 float atomic_exchange_float(float *p, float val);
7044 uint64_t atomic_exchange_uint64(uint64_t *p, uint64_t val);
7045 
7046 static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__ {
7047  return atomic_exchange_unsigned(dest, val);
7048 }
7049 static inline int atomic_exchange(int *dest, int val) __CPU__ __HC__ {
7050  return atomic_exchange_int(dest, val);
7051 }
7052 static inline float atomic_exchange(float *dest, float val) __CPU__ __HC__ {
7053  return atomic_exchange_float(dest, val);
7054 }
7055 static inline uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__ {
7056  return atomic_exchange_uint64(dest, val);
7057 }
7058 #else
7059 extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) __CPU__ __HC__;
7060 extern int atomic_exchange(int *dest, int val) __CPU__ __HC__;
7061 extern float atomic_exchange(float *dest, float val) __CPU__ __HC__;
7062 extern uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__;
7063 #endif
7064 
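// Example (illustrative sketch, not part of the original header): claiming slots
// with atomic_exchange inside a kernel. "flag_storage" is a placeholder host
// vector of 64 zero-initialized unsigned ints.
//
//   hc::array_view<unsigned int, 1> flags(64, flag_storage);
//   hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1> i) [[hc]] {
//       unsigned int old = hc::atomic_exchange(&flags[i], 1u);
//       // old == 0 means this work-item was the first to claim slot i
//   });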
7097 #if __KALMAR_ACCELERATOR__ == 1
7098 extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) __HC__;
7099 extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) __HC__;
7100 extern "C" uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val) __HC__;
7101 
7102 static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
7103  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
7104  return (*dest == val);
7105 }
7106 static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
7107  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
7108  return (*dest == val);
7109 }
7110 static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
7111  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
7112  return (*dest == val);
7113 }
7114 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7115 unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
7116 int atomic_compare_exchange_int(int *dest, int expected_val, int val);
7117 uint64_t atomic_compare_exchange_uint64(uint64_t *dest, uint64_t expected_val, uint64_t val);
7118 
7119 static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__ {
7120  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
7121  return (*dest == val);
7122 }
7123 static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__ {
7124  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
7125  return (*dest == val);
7126 }
7127 static inline bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__ {
7128  *expected_val = atomic_compare_exchange_uint64(dest, *expected_val, val);
7129  return (*dest == val);
7130 }
7131 #else
7132 extern bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) __CPU__ __HC__;
7133 extern bool atomic_compare_exchange(int *dest, int *expected_val, int val) __CPU__ __HC__;
7134 extern bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__;
7135 #endif
7136 
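// Example (illustrative sketch, not part of the original header): a retry loop
// built on atomic_compare_exchange, which writes the observed value back into
// *expected_val and reports whether the exchange took effect. "counter" is a
// placeholder pointer to a shared unsigned int.
//
//   unsigned int expected = *counter;
//   while (!hc::atomic_compare_exchange(counter, &expected, expected + 1u)) {
//       // expected now holds the latest observed value; retry with it
//   }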
7166 #if __KALMAR_ACCELERATOR__ == 1
7167 extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) __HC__;
7168 extern "C" int atomic_add_int(int *p, int val) __HC__;
7169 extern "C" float atomic_add_float(float *p, float val) __HC__;
7170 extern "C" uint64_t atomic_add_uint64(uint64_t *p, uint64_t val) __HC__;
7171 
7172 static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7173  return atomic_add_unsigned(x, y);
7174 }
7175 static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
7176  return atomic_add_int(x, y);
7177 }
7178 static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
7179  return atomic_add_float(x, y);
7180 }
7181 static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7182  return atomic_add_uint64(x, y);
7183 }
7184 
7185 extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) __HC__;
7186 extern "C" int atomic_sub_int(int *p, int val) __HC__;
7187 extern "C" float atomic_sub_float(float *p, float val) __HC__;
7188 
7189 static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7190  return atomic_sub_unsigned(x, y);
7191 }
7192 static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
7193  return atomic_sub_int(x, y);
7194 }
7195 static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
7196  return atomic_sub_float(x, y);
7197 }
7198 
7199 extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) __HC__;
7200 extern "C" int atomic_and_int(int *p, int val) __HC__;
7201 extern "C" uint64_t atomic_and_uint64(uint64_t *p, uint64_t val) __HC__;
7202 
7203 static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7204  return atomic_and_unsigned(x, y);
7205 }
7206 static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
7207  return atomic_and_int(x, y);
7208 }
7209 static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7210  return atomic_and_uint64(x, y);
7211 }
7212 
7213 extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) __HC__;
7214 extern "C" int atomic_or_int(int *p, int val) __HC__;
7215 extern "C" uint64_t atomic_or_uint64(uint64_t *p, uint64_t val) __HC__;
7216 
7217 static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7218  return atomic_or_unsigned(x, y);
7219 }
7220 static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
7221  return atomic_or_int(x, y);
7222 }
7223 static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7224  return atomic_or_uint64(x, y);
7225 }
7226 
7227 extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) __HC__;
7228 extern "C" int atomic_xor_int(int *p, int val) __HC__;
7229 extern "C" uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val) __HC__;
7230 
7231 static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7232  return atomic_xor_unsigned(x, y);
7233 }
7234 static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
7235  return atomic_xor_int(x, y);
7236 }
7237 static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7238  return atomic_xor_uint64(x, y);
7239 }
7240 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7241 unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
7242 int atomic_add_int(int *p, int val);
7243 float atomic_add_float(float *p, float val);
7244 uint64_t atomic_add_uint64(uint64_t *p, uint64_t val);
7245 
7246 static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7247  return atomic_add_unsigned(x, y);
7248 }
7249 static inline int atomic_fetch_add(int *x, int y) __CPU__ __HC__ {
7250  return atomic_add_int(x, y);
7251 }
7252 static inline float atomic_fetch_add(float *x, float y) __CPU__ __HC__ {
7253  return atomic_add_float(x, y);
7254 }
7255 static inline uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7256  return atomic_add_uint64(x, y);
7257 }
7258 
7259 unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
7260 int atomic_sub_int(int *p, int val);
7261 float atomic_sub_float(float *p, float val);
7262 
7263 static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7264  return atomic_sub_unsigned(x, y);
7265 }
7266 static inline int atomic_fetch_sub(int *x, int y) __CPU__ __HC__ {
7267  return atomic_sub_int(x, y);
7268 }
7269 static inline float atomic_fetch_sub(float *x, float y) __CPU__ __HC__ {
7270  return atomic_sub_float(x, y);
7271 }
7272 
7273 unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
7274 int atomic_and_int(int *p, int val);
7275 uint64_t atomic_and_uint64(uint64_t *p, uint64_t val);
7276 
7277 static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7278  return atomic_and_unsigned(x, y);
7279 }
7280 static inline int atomic_fetch_and(int *x, int y) __CPU__ __HC__ {
7281  return atomic_and_int(x, y);
7282 }
7283 static inline uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7284  return atomic_and_uint64(x, y);
7285 }
7286 
7287 unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
7288 int atomic_or_int(int *p, int val);
7289 uint64_t atomic_or_uint64(uint64_t *p, uint64_t val);
7290 
7291 static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7292  return atomic_or_unsigned(x, y);
7293 }
7294 static inline int atomic_fetch_or(int *x, int y) __CPU__ __HC__ {
7295  return atomic_or_int(x, y);
7296 }
7297 static inline uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7298  return atomic_or_uint64(x, y);
7299 }
7300 
7301 unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
7302 int atomic_xor_int(int *p, int val);
7303 uint64_t atomic_xor_uint64(uint64_t *p, uint64_t val);
7304 
7305 static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) __CPU__ __HC__ {
7306  return atomic_xor_unsigned(x, y);
7307 }
7308 static inline int atomic_fetch_xor(int *x, int y) __CPU__ __HC__ {
7309  return atomic_xor_int(x, y);
7310 }
7311 static inline uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__ {
7312  return atomic_xor_uint64(x, y);
7313 }
7314 #else
7315 extern unsigned atomic_fetch_add(unsigned *x, unsigned y) __CPU__ __HC__;
7316 extern int atomic_fetch_add(int *x, int y) __CPU__ __HC__;
7317 extern float atomic_fetch_add(float *x, float y) __CPU__ __HC__;
7318 extern uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__;
7319 
7320 extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) __CPU__ __HC__;
7321 extern int atomic_fetch_sub(int *x, int y) __CPU__ __HC__;
7322 extern float atomic_fetch_sub(float *x, float y) __CPU__ __HC__;
7323 
7324 extern unsigned atomic_fetch_and(unsigned *x, unsigned y) __CPU__ __HC__;
7325 extern int atomic_fetch_and(int *x, int y) __CPU__ __HC__;
7326 extern uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__;
7327 
7328 extern unsigned atomic_fetch_or(unsigned *x, unsigned y) __CPU__ __HC__;
7329 extern int atomic_fetch_or(int *x, int y) __CPU__ __HC__;
7330 extern uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__;
7331 
7332 extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) __CPU__ __HC__;
7333 extern int atomic_fetch_xor(int *x, int y) __CPU__ __HC__;
7334 extern uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__;
7335 #endif
7336 
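// Example (illustrative sketch, not part of the original header): counting
// matching elements with atomic_fetch_add. "data" and "count_storage" are
// placeholder host containers; count_storage holds a single zero-initialized
// unsigned int and n is the input length.
//
//   hc::array_view<const int, 1> in(n, data);
//   hc::array_view<unsigned int, 1> count(1, count_storage);
//   hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//       if (in[i] > 0)
//           hc::atomic_fetch_add(&count[0], 1u);
//   });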
7337 #if __KALMAR_ACCELERATOR__ == 1
7338 extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) __HC__;
7339 extern "C" int atomic_max_int(int *p, int val) __HC__;
7340 extern "C" uint64_t atomic_max_uint64(uint64_t *p, uint64_t val) __HC__;
7341 
7342 static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
7343  return atomic_max_unsigned(x, y);
7344 }
7345 static inline int atomic_fetch_max(int *x, int y) __HC__ {
7346  return atomic_max_int(x, y);
7347 }
7348 static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
7349  return atomic_max_uint64(x, y);
7350 }
7351 
7352 extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) __HC__;
7353 extern "C" int atomic_min_int(int *p, int val) __HC__;
7354 extern "C" uint64_t atomic_min_uint64(uint64_t *p, uint64_t val) __HC__;
7355 
7356 static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
7357  return atomic_min_unsigned(x, y);
7358 }
7359 static inline int atomic_fetch_min(int *x, int y) __HC__ {
7360  return atomic_min_int(x, y);
7361 }
7362 static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
7363  return atomic_min_uint64(x, y);
7364 }
7365 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7366 unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
7367 int atomic_max_int(int *p, int val);
7368 uint64_t atomic_max_uint64(uint64_t *p, uint64_t val);
7369 
7370 static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) __HC__ {
7371  return atomic_max_unsigned(x, y);
7372 }
7373 static inline int atomic_fetch_max(int *x, int y) __HC__ {
7374  return atomic_max_int(x, y);
7375 }
7376 static inline uint64_t atomic_fetch_max(uint64_t *x, uint64_t y) __HC__ {
7377  return atomic_max_uint64(x, y);
7378 }
7379 
7380 unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
7381 int atomic_min_int(int *p, int val);
7382 uint64_t atomic_min_uint64(uint64_t *p, uint64_t val);
7383 
7384 static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) __HC__ {
7385  return atomic_min_unsigned(x, y);
7386 }
7387 static inline int atomic_fetch_min(int *x, int y) __HC__ {
7388  return atomic_min_int(x, y);
7389 }
7390 static inline uint64_t atomic_fetch_min(uint64_t *x, uint64_t y) __HC__ {
7391  return atomic_min_uint64(x, y);
7392 }
7393 #else
7394 extern int atomic_fetch_max(int * dest, int val) __CPU__ __HC__;
7395 extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) __CPU__ __HC__;
7396 extern uint64_t atomic_fetch_max(uint64_t * dest, uint64_t val) __CPU__ __HC__;
7397 
7398 extern int atomic_fetch_min(int * dest, int val) __CPU__ __HC__;
7399 extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) __CPU__ __HC__;
7400 extern uint64_t atomic_fetch_min(uint64_t * dest, uint64_t val) __CPU__ __HC__;
7401 #endif
7402 
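// Example (illustrative sketch, not part of the original header): reducing to a
// global maximum with atomic_fetch_max. "best_storage" is a placeholder host
// container with one element initialized to INT_MIN, and "in" is a read-only
// view of the n input values.
//
//   hc::array_view<int, 1> best(1, best_storage);
//   hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//       hc::atomic_fetch_max(&best[0], in[i]);
//   });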
7418 #if __KALMAR_ACCELERATOR__ == 1
7419 extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) __HC__;
7420 extern "C" int atomic_inc_int(int *p) __HC__;
7421 
7422 static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
7423  return atomic_inc_unsigned(x);
7424 }
7425 static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
7426  return atomic_inc_int(x);
7427 }
7428 
7429 extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) __HC__;
7430 extern "C" int atomic_dec_int(int *p) __HC__;
7431 
7432 static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
7433  return atomic_dec_unsigned(x);
7434 }
7435 static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
7436  return atomic_dec_int(x);
7437 }
7438 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7439 unsigned int atomic_inc_unsigned(unsigned int *p);
7440 int atomic_inc_int(int *p);
7441 
7442 static inline unsigned int atomic_fetch_inc(unsigned int *x) __CPU__ __HC__ {
7443  return atomic_inc_unsigned(x);
7444 }
7445 static inline int atomic_fetch_inc(int *x) __CPU__ __HC__ {
7446  return atomic_inc_int(x);
7447 }
7448 
7449 unsigned int atomic_dec_unsigned(unsigned int *p);
7450 int atomic_dec_int(int *p);
7451 
7452 static inline unsigned int atomic_fetch_dec(unsigned int *x) __CPU__ __HC__ {
7453  return atomic_dec_unsigned(x);
7454 }
7455 static inline int atomic_fetch_dec(int *x) __CPU__ __HC__ {
7456  return atomic_dec_int(x);
7457 }
7458 #else
7459 extern int atomic_fetch_inc(int * _Dest) __CPU__ __HC__;
7460 extern unsigned int atomic_fetch_inc(unsigned int * _Dest) __CPU__ __HC__;
7461 
7462 extern int atomic_fetch_dec(int * _Dest) __CPU__ __HC__;
7463 extern unsigned int atomic_fetch_dec(unsigned int * _Dest) __CPU__ __HC__;
7464 #endif
7465 
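// Example (illustrative sketch, not part of the original header): handing out
// output slots with atomic_fetch_inc, which returns the pre-increment value.
// "in", "out", and "cursor" are placeholder array_views; cursor holds one
// unsigned int initialized to zero.
//
//   hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//       if (in[i] != 0) {
//           unsigned int slot = hc::atomic_fetch_inc(&cursor[0]);
//           out[slot] = in[i];
//       }
//   });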
7478 extern "C" unsigned int __atomic_wrapinc(unsigned int* address, unsigned int val) __HC__;
7479 
7490 extern "C" unsigned int __atomic_wrapdec(unsigned int* address, unsigned int val) __HC__;
7491 
7492 
7493 // ------------------------------------------------------------------------
7494 // parallel_for_each
7495 // ------------------------------------------------------------------------
7496 
7497 template <int N, typename Kernel>
7498 completion_future parallel_for_each(const accelerator_view&, const extent<N>&, const Kernel&);
7499 
7500 template <typename Kernel>
7501 completion_future parallel_for_each(const accelerator_view&, const tiled_extent<3>&, const Kernel&);
7502 
7503 template <typename Kernel>
7504 completion_future parallel_for_each(const accelerator_view&, const tiled_extent<2>&, const Kernel&);
7505 
7506 template <typename Kernel>
7507 completion_future parallel_for_each(const accelerator_view&, const tiled_extent<1>&, const Kernel&);
7508 
7509 template <int N, typename Kernel>
7510 completion_future parallel_for_each(const extent<N>& compute_domain, const Kernel& f) {
7511  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7512 }
7513 
7514 template <typename Kernel>
7515 completion_future parallel_for_each(const tiled_extent<3>& compute_domain, const Kernel& f) {
7516  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7517 }
7518 
7519 template <typename Kernel>
7520 completion_future parallel_for_each(const tiled_extent<2>& compute_domain, const Kernel& f) {
7521  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7522 }
7523 
7524 template <typename Kernel>
7525 completion_future parallel_for_each(const tiled_extent<1>& compute_domain, const Kernel& f) {
7526  return parallel_for_each(accelerator::get_auto_selection_view(), compute_domain, f);
7527 }
7528 
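// Example (illustrative sketch, not part of the original header): launching a
// SAXPY kernel with the extent-only overload, which dispatches to the
// auto-selected accelerator_view. "xs" and "ys" are placeholder host vectors of
// length n.
//
//   hc::array_view<float, 1> x(n, xs), y(n, ys);
//   float a = 2.0f;
//   hc::completion_future cf =
//       hc::parallel_for_each(hc::extent<1>(n), [=](hc::index<1> i) [[hc]] {
//           y[i] = a * x[i] + y[i];
//       });
//   cf.wait();  // y.synchronize() (or y's destruction) writes results back to ys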
7529 template <int N, typename Kernel, typename _Tp>
7530 struct pfe_helper
7531 {
7532  static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
7533  int i;
7534  for (i = 0; i < k.ext[N - 1]; ++i) {
7535  idx[N - 1] = i;
7536  pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
7537  }
7538  }
7539 };
7540 template <typename Kernel, typename _Tp>
7541 struct pfe_helper<0, Kernel, _Tp>
7542 {
7543  static inline void call(Kernel& k, _Tp& idx) __CPU__ __HC__ {
7544 #if __KALMAR_ACCELERATOR__ == 1
7545  k.k(idx);
7546 #endif
7547  }
7548 };
7549 
7550 template <int N, typename Kernel>
7551 class pfe_wrapper
7552 {
7553 public:
7554  explicit pfe_wrapper(const extent<N>& other, const Kernel& f) __CPU__ __HC__
7555  : ext(other), k(f) {}
7556  void operator() (index<N> idx) __CPU__ __HC__ {
7557  pfe_helper<N - 3, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
7558  }
7559 private:
7560  const extent<N> ext;
7561  const Kernel k;
7562  template <int K, typename Ker, typename _Tp>
7563  friend struct pfe_helper;
7564 };
7565 
7566 #pragma clang diagnostic push
7567 #pragma clang diagnostic ignored "-Wreturn-type"
7568 #pragma clang diagnostic ignored "-Wunused-variable"
7569 //ND parallel_for_each, nontiled
7570 template <int N, typename Kernel>
7571 __attribute__((noinline,used)) completion_future parallel_for_each(
7572  const accelerator_view& av,
7573  const extent<N>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7574 #if __KALMAR_ACCELERATOR__ != 1
7575  for(int i = 0 ; i < N ; i++)
7576  {
7577  // silently return in case any dimension of the extent is 0
7578  if (compute_domain[i] == 0)
7579  return completion_future();
7580  if (compute_domain[i] < 0)
7581  throw invalid_compute_domain("Extent is less than 0.");
7582  if (static_cast<size_t>(compute_domain[i]) > 4294967295L)
7583  throw invalid_compute_domain("Extent size too large.");
7584  }
7585  size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
7586  static_cast<size_t>(compute_domain[N - 2]),
7587  static_cast<size_t>(compute_domain[N - 3])};
7588 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7589  if (is_cpu()) {
7590  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7591  }
7592 #endif
7593  if (av.get_accelerator().get_device_path() == L"cpu") {
7594  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7595  }
7596  const pfe_wrapper<N, Kernel> _pf(compute_domain, f);
7597  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<pfe_wrapper<N, Kernel>, 3>(av.pQueue, ext, NULL, _pf));
7598 #else
7599 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7600  int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
7601 #endif
7602  auto bar = &pfe_wrapper<N, Kernel>::operator();
7604  int* foo = reinterpret_cast<int*>(&pfe_wrapper<N, Kernel>::__cxxamp_trampoline);
7605 #endif
7606 }
7607 #pragma clang diagnostic pop
7608 
7609 #pragma clang diagnostic push
7610 #pragma clang diagnostic ignored "-Wreturn-type"
7611 #pragma clang diagnostic ignored "-Wunused-variable"
7612 //1D parallel_for_each, nontiled
7613 template <typename Kernel>
7614 __attribute__((noinline,used)) completion_future parallel_for_each(
7615  const accelerator_view& av, const extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7616 #if __KALMAR_ACCELERATOR__ != 1
7617  // silently return in case any dimension of the extent is 0
7618  if (compute_domain[0] == 0)
7619  return completion_future();
7620  if (compute_domain[0] < 0) {
7621  throw invalid_compute_domain("Extent is less than 0.");
7622  }
7623  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7624  throw invalid_compute_domain("Extent size too large.");
7625 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7626  if (is_cpu()) {
7627  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7628  }
7629 #endif
7630  size_t ext = compute_domain[0];
7631  if (av.get_accelerator().get_device_path() == L"cpu") {
7632  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7633  }
7634  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 1>(av.pQueue, &ext, NULL, f));
7635 #else //if __KALMAR_ACCELERATOR__ != 1
7636  //to ensure functor has right operator() defined
7637  //this triggers the trampoline code being emitted
7638  auto foo = &Kernel::__cxxamp_trampoline;
7639  auto bar = &Kernel::operator();
7640 #endif
7641 }
7642 #pragma clang diagnostic pop
7643 
7644 #pragma clang diagnostic push
7645 #pragma clang diagnostic ignored "-Wreturn-type"
7646 #pragma clang diagnostic ignored "-Wunused-variable"
7647 //2D parallel_for_each, nontiled
7648 template <typename Kernel>
7649 __attribute__((noinline,used)) completion_future parallel_for_each(
7650  const accelerator_view& av, const extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7651 #if __KALMAR_ACCELERATOR__ != 1
7652  // silently return in case any dimension of the extent is 0
7653  if (compute_domain[0] == 0 || compute_domain[1] == 0)
7654  return completion_future();
7655  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
7656  throw invalid_compute_domain("Extent is less than 0.");
7657  }
7658  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7659  throw invalid_compute_domain("Extent size too large.");
7660  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7661  throw invalid_compute_domain("Extent size too large.");
7662 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7663  if (is_cpu()) {
7664  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7665  }
7666 #endif
7667  size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
7668  static_cast<size_t>(compute_domain[0])};
7669  if (av.get_accelerator().get_device_path() == L"cpu") {
7670  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7671  }
7672  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 2>(av.pQueue, ext, NULL, f));
7673 #else //if __KALMAR_ACCELERATOR__ != 1
7674  //to ensure functor has right operator() defined
7675  //this triggers the trampoline code being emitted
7676  auto foo = &Kernel::__cxxamp_trampoline;
7677  auto bar = &Kernel::operator();
7678 #endif
7679 }
7680 #pragma clang diagnostic pop
7681 
7682 #pragma clang diagnostic push
7683 #pragma clang diagnostic ignored "-Wreturn-type"
7684 #pragma clang diagnostic ignored "-Wunused-variable"
7685 //3D parallel_for_each, nontiled
7686 template <typename Kernel>
7687 __attribute__((noinline,used)) completion_future parallel_for_each(
7688  const accelerator_view& av, const extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7689 #if __KALMAR_ACCELERATOR__ != 1
7690  // silently return in case any dimension of the extent is 0
7691  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
7692  return completion_future();
7693  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
7694  throw invalid_compute_domain("Extent is less than 0.");
7695  }
7696  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7697  throw invalid_compute_domain("Extent size too large.");
7698  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7699  throw invalid_compute_domain("Extent size too large.");
7700  if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
7701  throw invalid_compute_domain("Extent size too large.");
7702 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7703  if (is_cpu()) {
7704  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7705  }
7706 #endif
7707  size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
7708  static_cast<size_t>(compute_domain[1]),
7709  static_cast<size_t>(compute_domain[0])};
7710  if (av.get_accelerator().get_device_path() == L"cpu") {
7711  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7712  }
7713  return completion_future(Kalmar::mcw_cxxamp_launch_kernel_async<Kernel, 3>(av.pQueue, ext, NULL, f));
7714 #else //if __KALMAR_ACCELERATOR__ != 1
7715  //to ensure functor has right operator() defined
7716  //this triggers the trampoline code being emitted
7717  auto foo = &Kernel::__cxxamp_trampoline;
7718  auto bar = &Kernel::operator();
7719 #endif
7720 }
7721 #pragma clang diagnostic pop
7722 
7723 #pragma clang diagnostic push
7724 #pragma clang diagnostic ignored "-Wreturn-type"
7725 #pragma clang diagnostic ignored "-Wunused-variable"
7726 //1D parallel_for_each, tiled
7727 template <typename Kernel>
7728 __attribute__((noinline,used)) completion_future parallel_for_each(
7729  const accelerator_view& av, const tiled_extent<1>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7730 #if __KALMAR_ACCELERATOR__ != 1
7731  // silently return in case any dimension of the extent is 0
7732  if (compute_domain[0] == 0)
7733  return completion_future();
7734  if (compute_domain[0] < 0) {
7735  throw invalid_compute_domain("Extent is less than 0.");
7736  }
7737  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7738  throw invalid_compute_domain("Extent size too large.");
7739  size_t ext = compute_domain[0];
7740  size_t tile = compute_domain.tile_dim[0];
7741 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7742  if (is_cpu()) {
7743  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7744  } else
7745 #endif
7746  if (av.get_accelerator().get_device_path() == L"cpu") {
7747  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7748  }
7749  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
7750  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 1>(av.pQueue, &ext, &tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
7751 #else //if __KALMAR_ACCELERATOR__ != 1
7752  tiled_index<1> this_is_used_to_instantiate_the_right_index;
7753  //to ensure functor has right operator() defined
7754  //this triggers the trampoline code being emitted
7755  auto foo = &Kernel::__cxxamp_trampoline;
7756  auto bar = &Kernel::operator();
7757 #endif
7758 }
7759 #pragma clang diagnostic pop
7760 
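A hedged sketch of the tiled 1-D overload, assuming extent<1>::tile() with a tile size of 256 and using tile_static memory plus a tile barrier (names are illustrative, not from hc.hpp):

#include <hc.hpp>
#include <vector>

void reverse_within_tiles() {
    constexpr int N = 4096, TILE = 256;
    hc::accelerator_view av = hc::accelerator().get_default_view();
    std::vector<float> host(N, 1.0f);
    hc::array_view<float, 1> data(N, host);
    hc::tiled_extent<1> t_ext = hc::extent<1>(N).tile(TILE);
    hc::parallel_for_each(
        av, t_ext,
        [=](hc::tiled_index<1> tidx) [[hc]] {
            tile_static float lds[TILE];                        // per-tile scratch storage
            lds[tidx.local[0]] = data[tidx.global];
            tidx.barrier.wait();                                // every lane in the tile reaches this point
            data[tidx.global] = lds[TILE - 1 - tidx.local[0]];  // reverse the tile as a toy example
        });
    data.synchronize();
}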
7761 #pragma clang diagnostic push
7762 #pragma clang diagnostic ignored "-Wreturn-type"
7763 #pragma clang diagnostic ignored "-Wunused-variable"
7764 //2D parallel_for_each, tiled
7765 template <typename Kernel>
7766 __attribute__((noinline,used)) completion_future parallel_for_each(
7767  const accelerator_view& av, const tiled_extent<2>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7768 #if __KALMAR_ACCELERATOR__ != 1
7769  // silently return in case any dimension of the extent is 0
7770  if (compute_domain[0] == 0 || compute_domain[1] == 0)
7771  return completion_future();
7772  if (compute_domain[0] < 0 || compute_domain[1] < 0) {
7773  throw invalid_compute_domain("Extent is less than 0.");
7774  }
7775  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7776  throw invalid_compute_domain("Extent size too large.");
7777  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7778  throw invalid_compute_domain("Extent size too large.");
7779  size_t ext[2] = { static_cast<size_t>(compute_domain[1]),
7780  static_cast<size_t>(compute_domain[0])};
7781  size_t tile[2] = { static_cast<size_t>(compute_domain.tile_dim[1]),
7782  static_cast<size_t>(compute_domain.tile_dim[0]) };
7783 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7784  if (is_cpu()) {
7785  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7786  } else
7787 #endif
7788  if (av.get_accelerator().get_device_path() == L"cpu") {
7789  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7790  }
7791  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
7792  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 2>(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
7793 #else //if __KALMAR_ACCELERATOR__ != 1
7794  tiled_index<2> this_is_used_to_instantiate_the_right_index;
7795  //to ensure functor has right operator() defined
7796  //this triggers the trampoline code being emitted
7797  auto foo = &Kernel::__cxxamp_trampoline;
7798  auto bar = &Kernel::operator();
7799 #endif
7800 }
7801 #pragma clang diagnostic pop
7802 
7803 #pragma clang diagnostic push
7804 #pragma clang diagnostic ignored "-Wreturn-type"
7805 #pragma clang diagnostic ignored "-Wunused-variable"
7806 //3D parallel_for_each, tiled
7807 template <typename Kernel>
7808 __attribute__((noinline,used)) completion_future parallel_for_each(
7809  const accelerator_view& av, const tiled_extent<3>& compute_domain, const Kernel& f) __CPU__ __HC__ {
7810 #if __KALMAR_ACCELERATOR__ != 1
7811  // silently return in case any dimension of the extent is 0
7812  if (compute_domain[0] == 0 || compute_domain[1] == 0 || compute_domain[2] == 0)
7813  return completion_future();
7814  if (compute_domain[0] < 0 || compute_domain[1] < 0 || compute_domain[2] < 0) {
7815  throw invalid_compute_domain("Extent is less than 0.");
7816  }
7817  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
7818  throw invalid_compute_domain("Extent size too large.");
7819  if (static_cast<size_t>(compute_domain[1]) > 4294967295L)
7820  throw invalid_compute_domain("Extent size too large.");
7821  if (static_cast<size_t>(compute_domain[2]) > 4294967295L)
7822  throw invalid_compute_domain("Extent size too large.");
7823  size_t ext[3] = { static_cast<size_t>(compute_domain[2]),
7824  static_cast<size_t>(compute_domain[1]),
7825  static_cast<size_t>(compute_domain[0])};
7826  size_t tile[3] = { static_cast<size_t>(compute_domain.tile_dim[2]),
7827  static_cast<size_t>(compute_domain.tile_dim[1]),
7828  static_cast<size_t>(compute_domain.tile_dim[0]) };
7829 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
7830  if (is_cpu()) {
7831  return launch_cpu_task_async(av.pQueue, f, compute_domain);
7832  } else
7833 #endif
7834  if (av.get_accelerator().get_device_path() == L"cpu") {
7835  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
7836  }
7837  void *kernel = Kalmar::mcw_cxxamp_get_kernel<Kernel>(av.pQueue, f);
7838  return completion_future(Kalmar::mcw_cxxamp_execute_kernel_with_dynamic_group_memory_async<Kernel, 3>(av.pQueue, ext, tile, f, kernel, compute_domain.get_dynamic_group_segment_size()));
7839 #else //if __KALMAR_ACCELERATOR__ != 1
7840  tiled_index<3> this_is_used_to_instantiate_the_right_index;
7841  //to ensure functor has right operator() defined
7842  //this triggers the trampoline code being emitted
7843  auto foo = &Kernel::__cxxamp_trampoline;
7844  auto bar = &Kernel::operator();
7845 #endif
7846 }
7847 #pragma clang diagnostic pop
7848 
7849 } // namespace hc
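Note that every overload above throws runtime_exception when the target device path is L"cpu". A hedged sketch of selecting a non-CPU accelerator_view before dispatching (get_device_path() and get_default_view() appear in the reference entries below; hc::accelerator::get_all() is assumed from the accelerator API, and the helper name is made up):

#include <hc.hpp>
#include <vector>

hc::accelerator_view pick_gpu_view() {
    // Prefer any accelerator whose device path is not the host ("cpu") path.
    for (const hc::accelerator& acc : hc::accelerator::get_all()) {
        if (acc.get_device_path() != L"cpu")
            return acc.get_default_view();
    }
    // Fall back to the default accelerator's default view.
    return hc::accelerator().get_default_view();
}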
unsigned int __sad_u32_u16x2(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
uint64_t __pack_u8x8_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
The partial specialization array_view<const T,N> represents a view over elements of type const T with...
Definition: hc.hpp:5762
array_view(const extent< N > &ext)
Constructs an array_view which is not bound to a data source.
Definition: hc.hpp:5211
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator_view supports debugging through extensive ...
Definition: hc.hpp:197
This class is the return type of all asynchronous APIs and has an interface analogous to std::shared_...
Definition: hc.hpp:1130
Definition: hc.hpp:4064
array(int e0, int e1, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4387
uint64_t __unpackhi_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
void * get_hsa_agent()
Returns an opaque handle which points to the underlying HSA agent.
Definition: hc.hpp:478
void flush()
Sends the queued up commands in the accelerator_view to the device for execution. ...
Definition: hc.hpp:241
array_view(const extent< N > &ext, value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5194
extent(const extent &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:1604
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:6205
int64_t __unpacklo_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
const T value_type
The element type of this array.
Definition: hc.hpp:5781
int64_t __bitextract_s64(int64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
tiled_extent(const extent< 1 > &ext, int t0) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2087
array_view< const ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:6249
unsigned int __lerp_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do linear interpolation and computes the unsigned 8-bit average of packed data.
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6095
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4649
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2200
const index< 2 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3689
float __amdgcn_wave_sl1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2943
array & operator=(array &&other)
Moves the contents of the array "other" to this array.
Definition: hc.hpp:4705
array(int e0, int e1)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4276
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4643
bool get_is_auto_selection()
Returns a boolean value indicating whether the accelerator view when passed to a parallel_for_each wo...
Definition: hc.hpp:171
void * get_hsa_am_finegrained_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:515
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5225
tiled_extent(const extent< 2 > &ext, int t0, int t1, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2192
const index< 3 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3468
bool operator==(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>.
Definition: hc.hpp:1738
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3456
std::vector< accelerator > get_peers() const
Return a std::vector of this accelerator's peers.
Definition: hc.hpp:1070
array(const array_view< const T, N > &src, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array initialized with the contents of the array_view "src".
Definition: hc.hpp:4500
Represents an extent subdivided into tiles.
Definition: hc.hpp:2218
uint64_t atomic_fetch_and(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void * get_hsa_am_system_region()
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:503
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5299
array(const extent< N > &ext, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4374
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2309
uint64_t get_begin_tick()
Get the tick number when the underlying asynchronous operation begins.
Definition: hc.hpp:1316
size_t get_dedicated_memory() const
Returns the amount of dedicated memory (in KB) on an accelerator device.
Definition: hc.hpp:907
tiled_extent(const tiled_extent &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2018
float __shfl_up(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with lower ID relative to caller within a wavefront.
Definition: hc.hpp:3062
array_view< const T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:6187
float __unpack_f32_f32x2(double src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:5425
unsigned int __unpack_u32_u32x2(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
int __unpack_s32_s16x4(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2141
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2, int size) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2264
int __unpacklo_s16x2(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
bool operator!=(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent differe...
Definition: hc.hpp:847
array & operator=(const array &other)
Assigns the contents of the array "other" to this array, using a deep copy.
Definition: hc.hpp:4690
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchronous operation.
Definition: hc.hpp:103
accelerator_view get_associated_accelerator_view() const
This property returns the accelerator_view representing the preferred target where this array can be ...
Definition: hc.hpp:4675
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: hc.hpp:5908
unsigned int __lastbit_u32_u32(unsigned int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2535
float __amdgcn_wave_rl1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2982
void wait() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3330
The tile_barrier class is a capability class that is only creatable by the system, and passed to a tiled parallel_for_each function object as part of the tiled_index parameter.
Definition: hc.hpp:3294
index< N > operator/(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:559
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5649
accelerator()
Constructs a new accelerator object that represents the default accelerator.
Definition: hc.hpp:713
array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4572
The array_view<T,N> type represents a possibly cached view into the data held in an array<T...
Definition: hc.hpp:60
unsigned int __firstbit_u32_u32(unsigned int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2477
extent & operator*=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1763
int tile_dim[3]
Tile size for each dimension.
Definition: hc.hpp:2231
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3655
array(const extent< N > &ext, InputIter srcBegin)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: hc.hpp:4298
void * get_hsa_agent() const
Returns an opaque handle which points to the underlying HSA agent.
Definition: hc.hpp:1051
uint64_t __pack_u16x4_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4523
bool operator!=(const extent &other) const __CPU__ __HC__
Compares two objects of extent<N>.
Definition: hc.hpp:1741
unsigned int __bitrev_b32(unsigned int src0)[[hc]] __asm("llvm.bitreverse.i32")
Reverse the bits.
void global_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for global (but not tile-static) memory operations...
int64_t __unpackhi_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4997
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4326
void * get_hsa_am_finegrained_system_region() const
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:1001
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view.
Definition: hc.hpp:5367
const index< 1 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3581
Represents an extent subdivided into tiles.
Definition: hc.hpp:2031
bool get_is_empty()
Returns true if the accelerator_view is currently empty.
Definition: hc.hpp:458
tiled_extent(const tiled_extent< 2 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2173
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6056
array_view & operator=(const array_view< T, N > &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5919
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array to the array_view given by "dest", as if by calling "copy(*this...
Definition: hc.hpp:4753
extent & operator*=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1824
const index< 2 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3679
extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:1594
array_view(const extent< N > &extent, Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5180
Represents a unique position in N-dimensional space.
Definition: hc.hpp:58
uint64_t atomic_fetch_min(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void * get_hsa_kernarg_region()
Returns an opaque handle which points to the Kernarg region on the HSA agent.
Definition: hc.hpp:526
extent & operator+=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1755
accelerator & operator=(const accelerator &other)
Assigns an accelerator object to "this" accelerator object and returns a reference to "this" object...
Definition: hc.hpp:802
Definition: hc.hpp:6438
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2294
array_view< T, N > section(const index< N > &origin, const extent< N > &ext) __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4930
tiled_extent(const tiled_extent< 1 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2078
Definition: hc.hpp:6342
int __amdgcn_mbcnt_lo(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.lo")
Direct copy from indexed active work-item within a wavefront.
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:6028
unsigned int get_version() const
Returns a 32-bit unsigned integer representing the version number of this accelerator.
Definition: hc.hpp:890
int __mul24(int x, int y)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication.
Definition: hc.hpp:3184
bool get_is_peer(const accelerator &other) const
Check if other is peer of this accelerator.
Definition: hc.hpp:1061
T & operator()(int i0, int i1) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:4852
STL namespace.
int64_t __unpacklo_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
float __shfl(float var, int srcLane, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:3014
array_view< const T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:6272
void wait_with_tile_static_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3382
Definition: hc.hpp:2809
Definition: hc.hpp:7530
Definition: hc.hpp:4075
Definition: kalmar_exception.h:51
const index< 2 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3661
const index< 3 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3474
Definition: hc.hpp:7551
array_projection_helper< T, N >::result_type operator()(int i0) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4898
unsigned int atomic_fetch_inc(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest.
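For instance, a small sketch (assumed names; not from hc.hpp) of counting matching elements with atomic_fetch_inc inside a kernel:

#include <hc.hpp>
#include <vector>

unsigned int count_positive(std::vector<int>& host) {
    const int n = static_cast<int>(host.size());
    std::vector<unsigned int> hits(1, 0u);
    hc::array_view<int, 1> in(n, host);
    hc::array_view<unsigned int, 1> counter(1, hits);
    hc::parallel_for_each(
        hc::extent<1>(n),
        [=](hc::index<1> idx) [[hc]] {
            if (in[idx] > 0)
                hc::atomic_fetch_inc(&counter[0]);   // one atomic increment per match
        });
    counter.synchronize();                           // make the count visible on the host
    return hits[0];
}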
static bool set_default(const std::wstring &path)
Sets the default accelerator to the device path identified by the "path" argument.
Definition: hc.hpp:770
uint64_t __clock_u64() __HC__
Get system timestamp.
unsigned int __popcount_u32_b32(unsigned int input) __HC__
Count number of 1 bits in the input.
Definition: hc.hpp:2389
void wait_with_all_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3347
uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
array_view< const T, N > section(const index< N > &origin, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:4938
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:6196
void * get_hsa_kernarg_region() const
Returns an opaque handle which points to the Kernarg region on the HSA agent.
Definition: hc.hpp:1012
void * get_group_segment_base_pointer() __HC__
Fetch the address of the beginning of group segment.
unsigned int get_cu_count() const
Return the compute unit count of the accelerator.
Definition: hc.hpp:1087
array_view< T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:5624
unsigned int __sad_u32_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
void * get_hsa_am_region() const
Returns an opaque handle which points to the AM region on the HSA agent.
Definition: hc.hpp:977
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4520
extent & operator+=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1796
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5317
T & operator()(const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4812
namespace for internal classes of Kalmar compiler / runtime
Definition: hc.hpp:42
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3586
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4652
void wait_with_global_memory_fence() const __HC__
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: hc.hpp:3364
float __amdgcn_ds_swizzle(float src, int pattern)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2896
const index< 3 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3490
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array_view.
Definition: hc.hpp:5988
array(int e0, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4385
unsigned int __sad_u32_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Computes the sum of the absolute differences of src0 and src1 and then adds src2 to the result...
int64_t __unpackhi_s8x8(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
std::wstring get_device_path() const
Returns a system-wide unique device instance path that matches the "Device Instance Path" property fo...
Definition: hc.hpp:877
unsigned int size() const __CPU__ __HC__
This member function returns the total linear size of this extent<N> (in units of elements)...
Definition: hc.hpp:1695
void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, const void *args, size_t argsize, hc::completion_future *cf=nullptr, const char *kernel_name=nullptr)
Dispatch a kernel into the accelerator_view.
Definition: hc.hpp:597
array_view(const extent< N > &ext, const value_type *src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5830
unsigned int __unpacklo_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
array(int e0, int e1, int e2)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4278
unsigned int __lastbit_u32_u64(unsigned long long int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2539
void * get_dynamic_group_segment_base_pointer() __HC__
Fetch the address of the beginning of dynamic group segment.
void discard_data() const
Indicates to the runtime that it may discard the current logical contents of this array_view...
Definition: hc.hpp:5495
queuing_mode get_queuing_mode() const
Returns the queuing mode that this accelerator_view was created with.
Definition: hc.hpp:152
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: hc.hpp:6040
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2237
const index< 1 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3563
uint64_t __bitselect_b64(uint64_t src0, uint64_t src1, uint64_t src2) __HC__
Do bit field selection.
double __pack_f32x2_f32(double src0, float src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
Definition: hc.hpp:4053
array_view< T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:5615
accelerator_view & operator=(const accelerator_view &other)
Assigns an accelerator_view object to "this" accelerator_view object and returns a reference to "this...
Definition: hc.hpp:141
array_view(const array_view &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5891
int value_type
The element type of extent<N>.
Definition: hc.hpp:1588
int tile_dim[2]
Tile size for each dimension.
Definition: hc.hpp:2135
array(int e0)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:4274
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5997
uint64_t atomic_exchange(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, replace it with the value given in val and return the old ...
bool has_cpu_accessible_am()
Return true if the accelerator's memory can be mapped into the CPU's address space, and the CPU is allowed to access the memory directly with CPU memory operations.
Definition: hc.hpp:1106
Definition: hc.hpp:4087
array_view< const T, N > section(const index< N > &idx) const __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:4957
accelerator get_accelerator() const
Returns the accelerator that this accelerator_view has been created on.
Definition: hc.hpp:1461
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array.
Definition: hc.hpp:4663
extent & operator%=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1832
tiled_extent(int e0, int e1, int t0, int t1) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2152
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2009
Definition: kalmar_exception.h:22
uint64_t atomic_fetch_max(uint64_t *dest, uint64_t val) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
int __amdgcn_mbcnt_hi(int mask, int src)[[hc]] __asm("llvm.amdgcn.mbcnt.hi")
Direct copy from indexed active work-item within a wavefront.
accelerator_view(const accelerator_view &other)
Copy-constructs an accelerator_view object.
Definition: hc.hpp:129
extent & operator-=(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1800
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6230
array_view< const T, K > view_as(const extent< K > &viewExtent) const __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: hc.hpp:5087
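As a small illustration (hedged; the buffer and function names are made up), a rank-1 view of 12 elements can be re-viewed as a 3x4 rank-2 view without copying:

#include <hc.hpp>
#include <vector>

void reshape_example() {
    std::vector<float> host(12, 0.0f);
    hc::array_view<float, 1> flat(12, host);
    // view_as reinterprets the same 12 elements as 3 rows of 4; no data is moved.
    hc::array_view<float, 2> grid = flat.view_as(hc::extent<2>(3, 4));
    hc::parallel_for_each(grid.get_extent(),
        [=](hc::index<2> idx) [[hc]] {
            grid(idx[0], idx[1]) = static_cast<float>(idx[0] * 4 + idx[1]);
        });
    flat.synchronize();   // results are visible through the original rank-1 view
}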
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: hc.hpp:5289
void wait(hcWaitMode mode=hcWaitModeBlocked) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1235
array(const array &other)
Copy constructor.
Definition: hc.hpp:4242
index< N > operator-(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:498
unsigned int get_static_group_segment_size() __HC__
Fetch the size of static group segment.
array_view< ElementType, N > reinterpret_as() const __CPU__ __HC__
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: hc.hpp:5668
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2207
uint64_t __unpacklo_u8x8(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
hcAgentProfile get_profile() const
Returns the profile of the accelerator.
Definition: hc.hpp:1029
tiled_extent() __CPU__ __HC__
Default constructor.
Definition: hc.hpp:2050
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5957
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3441
completion_future(const completion_future &other)
Copy constructor.
Definition: hc.hpp:1147
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6220
accelerator_view get_default_view() const
Returns the default accelerator_view associated with the accelerator.
Definition: hc.hpp:813
extent(const int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1636
array_view(array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5160
accelerator(const accelerator &other)
Copy constructs an accelerator object.
Definition: hc.hpp:740
array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4570
unsigned int __unpackhi_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4532
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3485
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4829
unsigned int __bitselect_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Do bit field selection.
array(const extent< N > &ext, accelerator_view av, void *accelerator_pointer, access_type cpu_access_type=access_type_auto)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4405
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: hc.hpp:5948
bool contains(const index< N > &idx) const __CPU__ __HC__
Tests whether the index "idx" is properly contained within this extent (with an assumed origin of zer...
Definition: hc.hpp:1686
uint64_t __activelanemask_v4_b64_b1(unsigned int input) __HC__
Return a bit mask that shows which active work-items in the wavefront have a non-zero input.
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: hc.hpp:3684
unsigned int __unpack_u32_u8x8(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3542
void copy(const void *src, void *dst, size_t size_bytes)
Copies size_bytes bytes from src to dst.
Definition: hc.hpp:343
void copy(const array_view< const T, 1 > &src, const array_view< T, 1 > &dest)
The contents of "src" are copied into "dest".
Definition: hc.hpp:6694
extent operator-(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1791
extent operator--(int) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op 1.
Definition: hc.hpp:1860
const index< 1 > tile_dim
An index of rank 1, 2, 3 that represents the size of the tile.
Definition: hc.hpp:3591
array_view< T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5639
bool set_default_cpu_access_type(access_type type)
Sets the default_cpu_access_type for this accelerator.
Definition: hc.hpp:867
const T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view.
Definition: hc.hpp:5974
completion_future()
Default constructor.
Definition: hc.hpp:1138
const T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: hc.hpp:4838
std::wstring get_description() const
Returns a short textual description of the accelerator device.
Definition: hc.hpp:882
Definition: kalmar_exception.h:42
T value_type
The element type of this array.
Definition: hc.hpp:5145
T & operator[](const index< N > &idx) __CPU__ __HC__
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: hc.hpp:4803
unsigned int __bitmask_b32(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect.
unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5005
array_view(int e0, const value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5866
void * get_native_handle() const
Get the native handle for the asynchronous operation encapsulated in this completion_future object...
Definition: hc.hpp:1302
unsigned int __packcvt_u8x4_f32(float src0, float src1, float src2, float src3) __HC__
Takes four floating-point numbers, converts them to unsigned integer values, and packs them into a pack...
Definition: kalmar_math.h:691
array_view(const array_view< nc_T, N > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5881
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5848
array(const extent< N > &ext)
Constructs a new array with the supplied extent, located on the default view of the default accelerat...
Definition: hc.hpp:4264
int __mad24(int x, int y, int z)[[hc]]
Multiply two integers (x,y) but only the lower 24 bits will be used in the multiplication and then ad...
Definition: hc.hpp:3210
float __unpackcvt_f32_u8x4(unsigned int src0, unsigned int src1) __HC__
Unpacks a single element from a packed u8x4 value and converts it to an f32.
array(int e0, int e1, int e2, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4389
float __amdgcn_ds_permute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2880
unsigned int __popcount_u32_b64(unsigned long long int input) __HC__
Count number of 1 bits in the input.
Definition: hc.hpp:2399
array(int e0, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4317
array(const array_view< const T, N > &src, accelerator_view av, accelerator_view associated_av)
Constructs a staging array initialized with the array_view given by "src", which acts as a staging ar...
Definition: hc.hpp:4621
Represents an extent subdivided into tiles.
Definition: hc.hpp:2122
extent & operator+=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1816
void then(const functor &func)
This method enables specification of a completion callback func which is executed upon completion of ...
Definition: hc.hpp:1279
tiled_extent(const extent< 3 > &ext, int t0, int t1, int t2) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2283
array(int e0, int e1, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4425
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4526
int64_t __unpacklo_s16x4(int64_t src0, int64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
Represents a logical (isolated) accelerator view of a compute accelerator.
Definition: hc.hpp:120
int tile_dim[1]
Tile size for each dimension.
Definition: hc.hpp:2044
Represents an N-dimensional region of memory (with type T) located on an accelerator.
Definition: hc.hpp:61
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4869
tiled_extent(const extent< 1 > &ext, int t0, int size) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2096
uint64_t get_tick_frequency()
Get the frequency of ticks per second for the underlying asynchronous operation.
Definition: hc.hpp:1344
access_type get_default_cpu_access_type() const
Get the default cpu access_type for buffers created on this accelerator.
Definition: hc.hpp:946
unsigned int __unpackhi_u16x2(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4646
uint64_t get_system_ticks()
Get the current tick count for the GPU platform.
Definition: hc.hpp:93
Definition: hc.hpp:6400
bool operator==(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:419
array_view< T, K > view_as(const extent< K > &viewExtent) __CPU__ __HC__
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: hc.hpp:5078
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view.
Definition: hc.hpp:5897
Represents an extent subdivided into tiles.
Definition: hc.hpp:59
array_view< T, N > section(const index< N > &idx) __CPU__ __HC__
Equivalent to "section(idx, this->extent – idx)".
Definition: hc.hpp:4949
uint64_t get_end_tick()
Get the tick number when the underlying asynchronous operation ends.
Definition: hc.hpp:1330
array(int e0, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4320
void * get_hsa_am_region()
Returns an opaque handle which points to the AM region on the HSA agent.
Definition: hc.hpp:490
execute_order get_execute_order() const
Returns the execution order of this accelerator_view.
Definition: hc.hpp:157
extent(int e0) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by .
Definition: hc.hpp:1616
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4655
completion_future(completion_future &&other)
Move constructor.
Definition: hc.hpp:1159
bool is_hsa_accelerator() const
Returns whether the accelerator is based on HSA.
Definition: hc.hpp:1019
unsigned int __unpack_u32_u8x4(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
int __bitextract_s32(int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
bool get_supports_limited_double_precision() const
Returns a boolean value indicating whether the accelerator has limited double precision support (excl...
Definition: hc.hpp:922
const index< 1 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3575
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: hc.hpp:4301
uint64_t atomic_fetch_add(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
extent(_Tp...__t) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by .
Definition: hc.hpp:1620
array_view(int e0)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5258
int __unpacklo_s8x4(int src0, int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
tiled_extent(int e0, int e1, int e2, int t0, int t1, int t2) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2250
accelerator_view create_view(execute_order order=execute_in_order, queuing_mode mode=queuing_mode_automatic)
Creates and returns a new accelerator view on the accelerator with the supplied queuing mode...
Definition: hc.hpp:823
array_view< const T, 1 > section(int i0, int e0) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4993
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: hc.hpp:5376
int __unpack_s32_s8x4(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
array_projection_helper< T, N >::const_result_type operator()(int i0) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4906
const index< 1 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3569
int get_pending_async_ops()
Returns the number of pending asynchronous operations on this accelerator view.
Definition: hc.hpp:447
int get_use_count() const
Definition: hc.hpp:1381
bool get_has_display() const
This property indicates that the accelerator may be shared by (and thus have interference from) the o...
Definition: hc.hpp:900
unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Extract a range of bits.
Definition: hc.hpp:2409
tiled_extent(int e0, int t0) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2059
index< N > operator+(const index< N > &lhs, const index< N > &rhs)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:492
extent(int components[]) __CPU__ __HC__
Constructs an extent<N> with the coordinate values provided by the array of int component values...
Definition: hc.hpp:1647
std::future_status wait_for(const std::chrono::duration< _Rep, _Period > &_Rel_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1248
unsigned __pack_u16x2_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array_view< const T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:6225
void wait(hcWaitMode waitMode=hcWaitModeBlocked)
Performs a blocking wait for completion of all commands submitted to the accelerator view prior to ca...
Definition: hc.hpp:208
unsigned int __unpacklo_u8x4(unsigned int src0, unsigned int src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
accelerator_view get_accelerator_view() const
This property returns the accelerator_view representing the location where this array has been alloca...
Definition: hc.hpp:4669
unsigned int __atomic_wrapinc(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations:
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5464
array(const extent< N > &ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4596
Represents a unique position in N-dimensional space.
Definition: kalmar_index.h:226
array(const extent< N > &ext, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator vi...
Definition: hc.hpp:4549
uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
array_view< const T, N > section(const extent< N > &ext) const __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:4972
int __unpack_s32_s8x8(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator.
Definition: hc.hpp:953
bool operator!=(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: hc.hpp:431
array_view(int e0, value_type *src) __CPU__ __HC__
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: hc.hpp:5244
extent & operator/=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1767
const index< 2 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: hc.hpp:3673
void copy_to(array &dest) const
Copies the contents of this array to the array given by "dest", as if by calling "copy(*this, dest)".
Definition: hc.hpp:4735
extent & operator/=(int value) __CPU__ __HC__
For a given operator op, produces the same effect as (*this) = (*this) op value.
Definition: hc.hpp:1828
tiled_extent(int e0, int e1, int t0, int t1, int size) __CPU__ __HC__
Construct a tiled extent with the size of the extent and the size of the tile specified. ...
Definition: hc.hpp:2164
array & operator=(const array_view< T, N > &src)
Assigns the contents of the array_view "src", as if by calling "copy(src, *this)".
Definition: hc.hpp:4722
int __any(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2781
tiled_extent(const extent< 2 > &ext, int t0, int t1) __CPU__ __HC__
Constructs a tiled_extent<N> with the extent "ext".
Definition: hc.hpp:2182
int64_t __pack_s32x2_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
unsigned int __bytealign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary byte boundary.
extent & operator%=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1771
unsigned int __lastbit_u32_s32(int input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2543
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6132
unsigned int __firstbit_u32_s32(int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2502
bool is_ready()
Returns whether the asynchronous operation has completed.
Definition: hc.hpp:1357
access_type get_cpu_access_type() const
This property returns the CPU "access_type" allowed for this array.
Definition: hc.hpp:4680
T & operator()(const index< N > &idx) const __CPU__ __HC__
Returns a reference to the element of this array_view that is at the location in N-dimensional space ...
Definition: hc.hpp:5517
tiled_extent(const tiled_extent< 3 > &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:2273
uint64_t __unpackhi_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4332
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: hc.hpp:3640
uint64_t __cycle_u64() __HC__
Get hardware cycle count.
unsigned int get_group_segment_size() __HC__
Fetch the size of group segment.
float __amdgcn_wave_rr1(float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2963
uint64_t __bitmask_b64(unsigned int src0, unsigned int src1) __HC__
Create a bit mask that can be used with bitselect.
array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4427
uint64_t __pack_u32x2_u32(uint64_t src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
extent & operator--() __CPU__ __HC__
Produces the same effect as (*this) = (*this) - 1.
Definition: hc.hpp:1856
completion_future & operator=(completion_future &&_Other)
Move assignment.
Definition: hc.hpp:1188
array(int e0, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: hc.hpp:4568
const T & operator[](const index< N > &idx) const __CPU__ __HC__
Returns a const reference to the element of this array_view that is at the location in N-dimensional ...
Definition: hc.hpp:6088
unsigned int __sadhi_u16x2_u8x4(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
This function is mostly the same as sad except the sum of absolute differences is added to the most s...
tiled_extent(int e0, int t0, int size) __CPU__ __HC__
Constructs a tiled extent with the specified extent and tile sizes. ...
Definition: hc.hpp:2069
float __amdgcn_ds_bpermute(int index, float src)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2865
bool set_cu_mask(const std::vector< bool > &cu_mask)
Sets a CU (compute unit) affinity mask for this command queue.
Definition: hc.hpp:618
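A minimal sketch, assuming hc::accelerator::create_view() as the way to obtain a dedicated queue; the mask width is illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      hc::accelerator acc;                        // default accelerator
      hc::accelerator_view av = acc.create_view();
      // Restrict this queue to the first four compute units (illustrative mask).
      std::vector<bool> mask(4, true);
      bool applied = av.set_cu_mask(mask);        // true if the mask was accepted
      (void)applied;
      return 0;
    }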
T * data() const __CPU__ __HC__
Returns a pointer to the raw data underlying this array.
Definition: hc.hpp:4760
T * data() const __CPU__ __HC__
Returns a pointer to the first data element underlying this array_view.
Definition: hc.hpp:5352
int __bitinsert_s32(int src0, int src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
uint64_t __ballot(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return an integer whose Nth bit is ...
Definition: hc.hpp:2800
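A minimal sketch of collecting a wavefront-wide vote with __ballot; the predicate and extent are illustrative.

    #include <hc.hpp>
    #include <vector>
    #include <cstdint>

    int main() {
      std::vector<uint64_t> host(64, 0);
      hc::array_view<uint64_t, 1> out(64, host.data());
      hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1> idx) [[hc]] {
        int pred = (idx[0] % 2 == 0);       // illustrative predicate
        out[idx] = hc::__ballot(pred);      // bit N set iff lane N's predicate was non-zero
      });
      out.synchronize();
      return 0;
    }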
int __pack_s8x4_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: hc.hpp:4640
extent & operator++() __CPU__ __HC__
Produces the same effect as (*this) = (*this) + 1.
Definition: hc.hpp:1847
index< N > operator*(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:547
array_view< T, 2 > section(int i0, int i1, int e0, int e1) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5001
index< N > operator%(const index< N > &idx, int value)
Binary arithmetic operations that produce a new index<N> that is the result of performing the corresp...
Definition: kalmar_index.h:571
extent< N > get_extent() const __CPU__ __HC__
Access the extent that defines the shape of this array_view.
Definition: hc.hpp:5278
array(int e0, int e1, int e2, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4329
int64_t __unpackhi_s32x2(int64_t src0, int64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
unsigned int __activelanecount_u32_b1(unsigned int input) __HC__
Count the number of active work-items in the current wavefront that have a non-zero input...
Definition: hc.hpp:2768
array(const array_view< const T, N > &src)
Constructs a new array, located on the default view of the default accelerator, initialized with the ...
Definition: hc.hpp:4348
Definition: hc.hpp:3929
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: hc.hpp:4593
extent & operator-=(const extent &__r) __CPU__ __HC__
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: hc.hpp:1759
uint64_t __bitrev_b64(uint64_t src0)[[hc]] __asm("llvm.bitreverse.i64")
Reverse the bits.
bool get_supports_cpu_shared_memory() const
Returns a boolean value indicating whether the accelerator supports memory accessible both by the acc...
Definition: hc.hpp:941
uint64_t __unpacklo_u32x2(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
float __amdgcn_wave_sr1(float src, bool bound_ctrl)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2923
int __all(int predicate) __HC__
Evaluate predicate for all active work-items in the wavefront and return non-zero if and only if pred...
Definition: hc.hpp:2790
accelerator(const std::wstring &path)
Constructs a new accelerator object that represents the physical device named by the "path" argument...
Definition: hc.hpp:730
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2302
bool get_is_emulated() const
Returns a boolean value indicating whether the accelerator is emulated.
Definition: hc.hpp:935
Definition: hc.hpp:6372
bool atomic_compare_exchange(uint64_t *dest, uint64_t *expected_val, uint64_t val) __CPU__ __HC__
These functions attempt to perform these three steps atomically:
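A minimal sketch of a single compare-and-swap issued from within a kernel; the extent is illustrative, and only a work-item that observes the expected value installs the new one.

    #include <hc.hpp>
    #include <cstdint>

    int main() {
      uint64_t flag = 0;
      hc::array_view<uint64_t, 1> flag_view(1, &flag);
      hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1>) [[hc]] {
        uint64_t expected = 0;
        // Succeeds (returns true) only if *dest still equals *expected_val.
        hc::atomic_compare_exchange(&flag_view[0], &expected, 1);
      });
      flag_view.synchronize();
      return 0;
    }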
completion_future & operator=(const completion_future &_Other)
Copy assignment.
Definition: hc.hpp:1170
int64_t __pack_s16x4_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
extent & operator=(const extent &other) __CPU__ __HC__
Assigns the component values of "other" to this extent<N> object.
Definition: hc.hpp:1657
int __unpack_s32_s16x2(int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
const T & operator()(int i0) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6123
array_view< T, N > section(const index< N > &idx, const extent< N > &ext) const __CPU__ __HC__
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: hc.hpp:5602
array(int e0, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: hc.hpp:4423
int __unpackhi_s16x2(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4529
Heterogeneous C++ (HC) namespace.
Definition: grid_launch.h:10
int __shfl_xor(int var, int laneMask, int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item based on bitwise XOR of caller work-item ID within a wavefront...
Definition: hc.hpp:3142
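A minimal sketch of the usual butterfly reduction built on __shfl_xor, assuming a full wavefront of active work-items; the sizes are illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> in_host(64, 1), out_host(64, 0);
      hc::array_view<int, 1> in(64, in_host.data());
      hc::array_view<int, 1> out(64, out_host.data());
      hc::parallel_for_each(hc::extent<1>(64), [=](hc::index<1> idx) [[hc]] {
        int v = in[idx];
        // After log2(width) XOR steps, every lane holds the wavefront-wide sum.
        for (int mask = __HSA_WAVEFRONT_SIZE__ / 2; mask > 0; mask >>= 1)
          v += hc::__shfl_xor(v, mask);
        out[idx] = v;
      });
      out.synchronize();
      return 0;
    }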
int __pack_s16x2_s32(int src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
void * get_hsa_am_system_region() const
Returns an opaque handle which points to the AM system region on the HSA agent.
Definition: hc.hpp:989
extent operator++(int) __CPU__ __HC__
Produces the same effect as (*this) = (*this) + 1.
Definition: hc.hpp:1851
static std::vector< accelerator > get_all()
Returns a std::vector of accelerator objects (in no specific order) representing all accelerators tha...
Definition: hc.hpp:749
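A minimal sketch of enumerating the available accelerators, assuming the C++AMP-style get_device_path()/get_description() accessors.

    #include <hc.hpp>
    #include <iostream>

    int main() {
      // Enumerate every accelerator the runtime can see (order is unspecified).
      for (const hc::accelerator& acc : hc::accelerator::get_all())
        std::wcout << acc.get_device_path() << L" : "
                   << acc.get_description() << L"\n";
      return 0;
    }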
bool get_supports_double_precision() const
Returns a Boolean value indicating whether this accelerator supports double-precision (double) comput...
Definition: hc.hpp:914
unsigned int __unpack_u32_u16x4(uint64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
std::vector< accelerator_view > get_all_views()
Returns a vector of all accelerator_view objects associated with this accelerator.
Definition: hc.hpp:960
array_view< T, 1 > section(int i0, int e0) __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:4989
void copy_to(const array_view &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: hc.hpp:5335
array_view< T, N > section(const extent< N > &ext) __CPU__ __HC__
Equivalent to "section(index<N>(), ext)".
Definition: hc.hpp:4968
T & operator()(int i0, int i1, int i2) __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:4855
array(const extent< N > &ext, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_...
Definition: hc.hpp:4460
array(int e0, int e1, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: hc.hpp:4323
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator supports debugging.
Definition: hc.hpp:929
int __unpackhi_s8x4(int src0, int src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
const T & operator()(int i0, int i1) const __CPU__ __HC__
Equivalent to "array_view<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: hc.hpp:6128
uint64_t __unpacklo_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the lower half of the elements from each source into the destination.
array_view & operator=(const array_view &other) __CPU__ __HC__
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: hc.hpp:5928
void tile_static_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for tile-static (but not global) memory operations...
extent & operator-=(int value) __CPU__ __HC__
Produces the same effect as (*this) = (*this) - value.
Definition: hc.hpp:1820
size_t get_max_tile_static_size()
Returns the maximum size of tile static area available on this accelerator view.
Definition: hc.hpp:437
unsigned int __lastbit_u32_s64(unsigned long long input) __HC__
Find the first bit set to 1 in a number starting from the least significant bit.
Definition: hc.hpp:2547
uint64_t __unpackhi_u16x4(uint64_t src0, uint64_t src1) __HC__
Copy and interleave the upper half of the elements from each source into the destination.
#define __HSA_WAVEFRONT_SIZE__
Fetch the size of a wavefront.
Definition: hc.hpp:2373
uint64_t atomic_fetch_or(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
int64_t __pack_s8x8_s32(int64_t src0, int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
tile_barrier(const tile_barrier &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3317
int64_t __bitinsert_s64(int64_t src0, int64_t src1, unsigned int src2, unsigned int src3) __HC__
Replace a range of bits.
void * get_hsa_queue()
Returns an opaque handle which points to the underlying HSA queue.
Definition: hc.hpp:468
unsigned int __unpack_u32_u16x2(unsigned int src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
void set_dynamic_group_segment_size(unsigned int size) __CPU__
Set the size of dynamic group segment.
Definition: hc.hpp:2104
array(const extent< N > &ext, void *accelerator_pointer)
Constructs an array instance based on the given pointer on the device memory.
Definition: hc.hpp:4392
bool valid() const
This method is functionally identical to std::shared_future<void>::valid.
Definition: hc.hpp:1213
T * accelerator_pointer() const __CPU__ __HC__
Returns a pointer to the device memory underlying this array.
Definition: hc.hpp:4775
array_view(const array_view &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:5272
array_view< const ElementType, 1 > reinterpret_as() const __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with an (unsafe) reinterpretation of the element type.
Definition: hc.hpp:5051
unsigned int atomic_fetch_dec(unsigned int *_Dest) __CPU__ __HC__
Atomically increment or decrement the value stored at the location pointed to by dest.
array(array &&other)
Move constructor.
Definition: hc.hpp:4253
uint64_t atomic_fetch_xor(uint64_t *x, uint64_t y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
T value_type
The element type of this array.
Definition: hc.hpp:4227
array_view< T, K > view_as(extent< K > viewExtent) const __CPU__ __HC__
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: hc.hpp:5693
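A minimal sketch of re-shaping a rank-1 array_view as a rank-2 view over the same storage; the sizes are illustrative, and the new extent here covers exactly the same 12 elements.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> host(12, 0);
      hc::array_view<int, 1> flat(12, host.data());
      // 12 elements re-shaped as a 3x4 view; no data is copied.
      hc::array_view<int, 2> grid = flat.view_as(hc::extent<2>(3, 4));
      (void)grid;
      return 0;
    }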
unsigned int __activelaneid_u32() __HC__
Get the count of the number of earlier (in flattened work-item order) active work-items within the sa...
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const __CPU__ __HC__
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: hc.hpp:5009
unsigned int __firstbit_u32_s64(long long int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2520
array_view< ElementType, 1 > reinterpret_as() __CPU__ __HC__
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with an (unsafe) reinterpretation of the element type.
Definition: hc.hpp:5038
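A minimal sketch of viewing an array's storage as a flat run of a different element type; the types and extent are illustrative.

    #include <hc.hpp>

    int main() {
      hc::array<float, 2> arr(hc::extent<2>(4, 4));   // 16 floats on the default accelerator
      // Same storage seen as a rank-1 view of 32-bit words; no data is copied.
      hc::array_view<unsigned int, 1> bits = arr.reinterpret_as<unsigned int>();
      (void)bits;
      return 0;
    }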
const index< 2 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: hc.hpp:3667
int __amdgcn_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl)[[hc]]
Move DPP intrinsic.
unsigned int __bitalign_b32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Align 32 bits within 64 bits of data on an arbitrary bit boundary.
const T & operator()(int i0, int i1, int i2) const __CPU__ __HC__
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: hc.hpp:4872
int __unpack_s32_s32x2(int64_t src0, unsigned int src1) __HC__
Assign the elements specified by src1 from the packed value in src0.
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: hc.hpp:5412
const index< 3 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: hc.hpp:3462
const index< 3 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile ...
Definition: hc.hpp:3480
unsigned int get_dynamic_group_segment_size() const __CPU__
Return the size of dynamic group segment in bytes.
Definition: hc.hpp:2111
array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: hc.hpp:4517
void all_memory_fence(const tile_barrier &) __HC__
Establishes a thread-tile scoped memory fence for both global and tile-static memory operations...
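A minimal sketch of the usual fence-plus-barrier pattern in a tiled kernel; here the fence is taken through tile_barrier::wait_with_all_memory_fence(), which combines the rendezvous with the fence these free functions establish. Sizes are illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> host(256, 0);
      hc::array_view<int, 1> out(256, host.data());
      hc::parallel_for_each(hc::extent<1>(256).tile(64), [=](hc::tiled_index<1> tidx) [[hc]] {
        tile_static int lds[64];
        lds[tidx.local[0]] = tidx.global[0];
        // Make the tile-static stores visible to the whole tile before reading a neighbour's slot.
        tidx.barrier.wait_with_all_memory_fence();
        out[tidx.global] = lds[(tidx.local[0] + 1) % 64];
      });
      out.synchronize();
      return 0;
    }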
bool is_hsa_accelerator()
Returns whether the accelerator_view is based on HSA.
Definition: hc.hpp:533
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const __CPU__ __HC__
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: hc.hpp:5644
unsigned int __pack_u8x4_u32(unsigned int src0, unsigned int src1, unsigned int src2) __HC__
Assign the elements of the packed value in src0, replacing the element specified by src2 with the val...
bool operator==(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent the sam...
Definition: hc.hpp:837
std::future_status wait_until(const std::chrono::time_point< _Clock, _Duration > &_Abs_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: hc.hpp:1253
Definition: kalmar_math.h:297
unsigned int __firstbit_u32_u64(unsigned long long int input) __HC__
Count leading zero bits in the input.
Definition: hc.hpp:2489
Represents a physical accelerated computing device.
Definition: hc.hpp:700
Definition: hc_am.hpp:21
Definition: kalmar_runtime.h:14
int get_seqnum() const
Return the unique integer sequence-number for the accelerator.
Definition: hc.hpp:1095
unsigned int __atomic_wrapdec(unsigned int *address, unsigned int val) __HC__
Atomically do the following operations:
int __lane_id(void)[[hc]]
Direct copy from indexed active work-item within a wavefront.
Definition: hc.hpp:2845
array_view(const array< T, N > &src) __CPU__ __HC__
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: hc.hpp:5796
float atomic_fetch_sub(float *x, float y) __CPU__ __HC__
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
tiled_index(const tiled_index &other) __CPU__ __HC__
Copy constructor.
Definition: hc.hpp:3557
array_view(const extent< N > &extent, const Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: hc.hpp:5816
float __shfl_down(float var, const unsigned int delta, const int width=__HSA_WAVEFRONT_SIZE__) __HC__
Copy from an active work-item with higher ID relative to caller within a wavefront.
Definition: hc.hpp:3111
extent operator+(const index< N > &idx) __CPU__ __HC__
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: hc.hpp:1786
static accelerator_view get_auto_selection_view()
Returns an accelerator_view which when passed as the first argument to a parallel_for_each call cause...
Definition: hc.hpp:789
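A minimal sketch of dispatching against the auto-selection view so the runtime picks the accelerator; the extent and kernel body are illustrative.

    #include <hc.hpp>
    #include <vector>

    int main() {
      std::vector<int> host(1024, 0);
      hc::array_view<int, 1> out(1024, host.data());
      hc::accelerator_view av = hc::accelerator::get_auto_selection_view();
      hc::parallel_for_each(av, hc::extent<1>(1024), [=](hc::index<1> idx) [[hc]] {
        out[idx] = idx[0];
      });
      out.synchronize();
      return 0;
    }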