#include "hc_defines.h"
#include "kalmar_exception.h"
#include "kalmar_index.h"
#include "kalmar_runtime.h"
#include "kalmar_buffer.h"
#include "kalmar_serialize.h"
#include "kalmar_launch.h"
#include "kalmar_cpu_launch.h"

template <typename T, int N> class array;
class accelerator_view {
public:
    accelerator_view(const accelerator_view& other) : pQueue(other.pQueue) {}
    accelerator_view& operator=(const accelerator_view& other) {
        pQueue = other.pQueue;
        return *this;
    }
    // ...
    unsigned int get_version() const;
    // ...
    void wait() { pQueue->wait(); }
    // ...
    bool operator==(const accelerator_view& other) const {
        return pQueue == other.pQueue;
    }
    // ...

private:
    accelerator_view(std::shared_ptr<Kalmar::KalmarQueue> pQueue) : pQueue(pQueue) {}
    std::shared_ptr<Kalmar::KalmarQueue> pQueue;
    template <typename Kernel, int dim_ext>
    friend void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&,
                                                 size_t*, size_t*, const Kernel&);
    template <typename Kernel, int dim_ext>
    friend std::shared_future<void>* Kalmar::mcw_cxxamp_launch_kernel_async(
        const std::shared_ptr<Kalmar::KalmarQueue>&, size_t*, size_t*, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename Kernel, int N>
    friend void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>&, Kernel const&,
                                extent<N> const&);
#endif
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;
    template <int N, typename Kernel>
    friend void parallel_for_each(extent<N>, const Kernel&);
    template <int N, typename Kernel>
    friend void parallel_for_each(const accelerator_view&, extent<N>, const Kernel&);
    template <typename Kernel>
    friend void parallel_for_each(const accelerator_view&, extent<1>, const Kernel&);
    template <typename Kernel>
    friend void parallel_for_each(const accelerator_view&, extent<2>, const Kernel&);
    template <typename Kernel>
    friend void parallel_for_each(const accelerator_view&, extent<3>, const Kernel&);
    template <int D0, typename Kernel>
    friend void parallel_for_each(tiled_extent<D0>, const Kernel&);
    template <int D0, typename Kernel>
    friend void parallel_for_each(const accelerator_view&, tiled_extent<D0>, const Kernel&);
    template <int D0, int D1, typename Kernel>
    friend void parallel_for_each(tiled_extent<D0, D1>, const Kernel&);
    template <int D0, int D1, typename Kernel>
    friend void parallel_for_each(const accelerator_view&, tiled_extent<D0, D1>, const Kernel&);
    template <int D0, int D1, int D2, typename Kernel>
    friend void parallel_for_each(tiled_extent<D0, D1, D2>, const Kernel&);
    template <int D0, int D1, int D2, typename Kernel>
    friend void parallel_for_each(const accelerator_view&, tiled_extent<D0, D1, D2>, const Kernel&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // ...
#endif
    __attribute__((annotate("user_deserialize")))
    accelerator_view() restrict(amp,cpu);
};
class accelerator {
public:
    static const wchar_t default_accelerator[];
    static const wchar_t cpu_accelerator[];
    // ...
    explicit accelerator(const std::wstring& path)
        : pDev(Kalmar::getContext()->getDevice(path)) {}
    // ...
    static std::vector<accelerator> get_all() {
        auto Devices = Kalmar::getContext()->getDevices();
        std::vector<accelerator> ret(Devices.size());
        for (std::size_t i = 0; i < ret.size(); ++i) {
            // ...
        }
        return ret;   // NRVO applies; returning std::move(ret) would inhibit copy elision
    }
    static bool set_default(const std::wstring& path) {
        return Kalmar::getContext()->set_default(path);
    }
    static accelerator_view get_auto_selection_view() {
        return Kalmar::getContext()->auto_select();
    }
    // ...
    accelerator_view create_view(queuing_mode qmode = queuing_mode_automatic) {
        auto pQueue = pDev->createQueue();
        pQueue->set_mode(qmode);
        return pQueue;
    }
    // ...
    bool set_default_cpu_access_type(access_type default_cpu_access_type) {
        pDev->set_access(default_cpu_access_type);
        // ...
    }
    // ...

private:
    accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {}
    Kalmar::KalmarDevice* pDev;
};
class completion_future {
public:
    completion_future(const completion_future& other)
        : __amp_future(other.__amp_future), __thread_then(other.__thread_then) {}
    completion_future(completion_future&& other)
        : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then) {}
    completion_future& operator=(const completion_future& other) {
        if (this != &other) {
            __amp_future = other.__amp_future;
            __thread_then = other.__thread_then;
        }
        return *this;
    }
    completion_future& operator=(completion_future&& other) {
        if (this != &other) {
            __amp_future = std::move(other.__amp_future);
            __thread_then = other.__thread_then;
        }
        return *this;
    }
    // ...
    bool valid() const { return __amp_future.valid(); }
    // ...
    template <class _Rep, class _Period>
    std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const {
        return __amp_future.wait_for(_Rel_time);
    }
    template <class _Clock, class _Duration>
    std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const {
        return __amp_future.wait_until(_Abs_time);
    }
    // ...
    operator std::shared_future<void>() const {
        return __amp_future;
    }
    // ...
    template <typename functor>
    void then(const functor& func) {
#if __KALMAR_ACCELERATOR__ != 1
        // ...
        if (__thread_then == nullptr) {
            // ...
            __thread_then = new std::thread([&]() restrict(cpu) {
                // ...
            });
        }
#endif
    }
    // ...
    ~completion_future() {
        if (__thread_then != nullptr) {
            __thread_then->join();
        }
        delete __thread_then;
        __thread_then = nullptr;
    }

private:
    std::shared_future<void> __amp_future;
    std::thread* __thread_then = nullptr;

    completion_future(std::shared_future<void> __future)
        : __amp_future(__future) {}
    template <typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest);
    template <typename T, int N> friend
        completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
    template <typename InputIter, typename T, int N> friend
        completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);
    template <typename OutputIter, typename T, int N> friend
        completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
    template <typename OutputIter, typename T, int N> friend
        completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);

    template <typename T, int N> friend class array_view;
};
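// Usage sketch (illustrative, not part of the header): a completion_future
// returned by one of the copy_async overloads can be queried, waited on, or
// given a continuation.
//
//   Concurrency::array<int, 1> src(Concurrency::extent<1>(1024)), dst(Concurrency::extent<1>(1024));
//   completion_future cf = copy_async(src, dst);
//   cf.then([]() restrict(cpu) { /* runs once the copy has finished */ });
//   cf.wait();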
template <int N>
class extent {
public:
    static const int rank = N;
    // ...
    static_assert(N > 0, "Dimensionality must be positive");
    // ...
    extent(const extent& other) restrict(amp,cpu)
        : base_(other.base_) {}
    // ...
    explicit extent(int e0) restrict(amp,cpu)
        : base_(e0) {}
    template <typename ..._Tp>
    explicit extent(_Tp ... __t) restrict(amp,cpu)
        : base_(__t...) {
        static_assert(sizeof...(__t) <= 3,
            "Can only supply at most 3 individual coordinates in the constructor");
        static_assert(sizeof...(__t) == N, "rank should be consistent");
    }
    // ...
    explicit extent(const int components[]) restrict(amp,cpu)
        : base_(components) {}
    explicit extent(int components[]) restrict(amp,cpu)
        : base_(components) {}
    // ...
    extent& operator=(const extent& other) restrict(amp,cpu) {
        base_.operator=(other.base_);
        return *this;
    }
    // ...
    int operator[](unsigned int c) const restrict(amp,cpu) {
        return base_[c];
    }
    int& operator[](unsigned int c) restrict(amp,cpu) {
        return base_[c];
    }
    // ...
    bool contains(const index<N>& idx) const restrict(amp,cpu) {
        return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
    }
    // ...
    unsigned int size() const restrict(amp,cpu) {
        return Kalmar::index_helper<N, extent<N>>::count_size(*this);
    }
    // ...
    template <int D0>
    tiled_extent<D0> tile() const restrict(amp,cpu) {
        static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
        static_assert(D0 > 0, "All tile dimensions must be positive");
        return tiled_extent<D0>(*this);
    }
    template <int D0, int D1>
    tiled_extent<D0, D1> tile() const restrict(amp,cpu) {
        static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
        static_assert(D0 > 0 && D1 > 0, "All tile dimensions must be positive");
        return tiled_extent<D0, D1>(*this);
    }
    template <int D0, int D1, int D2>
    tiled_extent<D0, D1, D2> tile() const restrict(amp,cpu) {
        static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
        static_assert(D0 > 0 && D1 > 0 && D2 > 0, "All tile dimensions must be positive");
        return tiled_extent<D0, D1, D2>(*this);
    }
    // ...
    bool operator==(const extent& other) const restrict(amp,cpu) {
        return Kalmar::index_helper<N, extent<N>>::equal(*this, other);
    }
    bool operator!=(const extent& other) const restrict(amp,cpu) {
        return !(*this == other);
    }
    // ...
    extent& operator+=(const extent& __r) restrict(amp,cpu) {
        base_.operator+=(__r.base_);
        return *this;
    }
    extent& operator-=(const extent& __r) restrict(amp,cpu) {
        base_.operator-=(__r.base_);
        return *this;
    }
    extent& operator*=(const extent& __r) restrict(amp,cpu) {
        base_.operator*=(__r.base_);
        return *this;
    }
    extent& operator/=(const extent& __r) restrict(amp,cpu) {
        base_.operator/=(__r.base_);
        return *this;
    }
    extent& operator%=(const extent& __r) restrict(amp,cpu) {
        base_.operator%=(__r.base_);
        return *this;
    }
    // ...
    extent& operator+=(const index<N>& idx) restrict(amp,cpu) {
        base_.operator+=(idx.base_);
        return *this;
    }
    extent& operator-=(const index<N>& idx) restrict(amp,cpu) {
        base_.operator-=(idx.base_);
        return *this;
    }
    // ...
    extent& operator+=(int value) restrict(amp,cpu) {
        base_.operator+=(value);
        return *this;
    }
    extent& operator-=(int value) restrict(amp,cpu) {
        base_.operator-=(value);
        return *this;
    }
    extent& operator*=(int value) restrict(amp,cpu) {
        base_.operator*=(value);
        return *this;
    }
    extent& operator/=(int value) restrict(amp,cpu) {
        base_.operator/=(value);
        return *this;
    }
    extent& operator%=(int value) restrict(amp,cpu) {
        base_.operator%=(value);
        return *this;
    }
    // ...
    extent& operator++() restrict(amp,cpu) {
        base_.operator+=(1);
        return *this;
    }
    extent operator++(int) restrict(amp,cpu) {
        extent ret = *this;
        base_.operator+=(1);
        return ret;
    }
    extent& operator--() restrict(amp,cpu) {
        base_.operator-=(1);
        return *this;
    }
    extent operator--(int) restrict(amp,cpu) {
        extent ret = *this;
        base_.operator-=(1);
        return ret;
    }
    // ...
    template <int D0, int D1, int D2>
    explicit extent(const tiled_extent<D0, D1, D2>& other) restrict(amp,cpu)
        : base_(other.base_) {}

private:
    typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
    base base_;
    template <int K, typename Q> friend struct Kalmar::index_helper;
    template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
};
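// Usage sketch (illustrative): an extent describes the shape of an index
// space and can be tiled with compile-time tile sizes.
//
//   Concurrency::extent<2> e(64, 128);   // 64 x 128 elements
//   unsigned total = e.size();           // 8192
//   auto t = e.tile<16, 16>();           // tiled_extent<16, 16> over the same space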
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
template <typename Ker, typename Ti>
void bar_wrapper(Ker *f, Ti *t)
{
    (*f)(*t);
}

struct barrier_t {
    std::unique_ptr<ucontext_t[]> ctx;
    int idx = 0;
    barrier_t(int a)
        : ctx(new ucontext_t[a + 1]) {}
    template <typename Ti, typename Ker>
    void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) {
        getcontext(&ctx[x]);
        ctx[x].uc_stack.ss_sp = stack;
        ctx[x].uc_stack.ss_size = S;
        ctx[x].uc_link = &ctx[x - 1];
        makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
    }
    void swap(int a, int b) {
        swapcontext(&ctx[a], &ctx[b]);
    }
    void wait() {
        // ...
        swapcontext(&ctx[idx + 1], &ctx[idx]);
    }
};
#endif
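// How the CPU path emulates a tile barrier (illustrative sketch, not part of
// the header): each work-item of a tile runs as a ucontext fiber on one host
// thread, and wait() simply switches to the next fiber, so every work-item
// reaches the barrier before any of them resumes.  The core pattern:
//
//   ucontext_t main_ctx, fiber_ctx;
//   static char stack[SIGSTKSZ];
//   getcontext(&fiber_ctx);
//   fiber_ctx.uc_stack.ss_sp = stack;
//   fiber_ctx.uc_stack.ss_size = sizeof(stack);
//   fiber_ctx.uc_link = &main_ctx;              // resume here when the fiber returns
//   makecontext(&fiber_ctx, work_item_body, 0); // work_item_body: hypothetical void()
//   swapcontext(&main_ctx, &fiber_ctx);         // run the work-item until it yields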
#ifndef CLK_LOCAL_MEM_FENCE
#define CLK_LOCAL_MEM_FENCE (1)
#endif

#ifndef CLK_GLOBAL_MEM_FENCE
#define CLK_GLOBAL_MEM_FENCE (2)
#endif

class tile_barrier {
public:
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    using pb_t = std::shared_ptr<barrier_t>;
    // ...
#endif
    void wait() const restrict(amp) {
#if __KALMAR_ACCELERATOR__ == 1
        wait_with_all_memory_fence();
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        // ...
#endif
    }
    void wait_with_all_memory_fence() const restrict(amp) {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        // ...
#endif
    }
    void wait_with_global_memory_fence() const restrict(amp) {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_GLOBAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        // ...
#endif
    }
    void wait_with_tile_static_memory_fence() const restrict(amp) {
#if __KALMAR_ACCELERATOR__ == 1
        amp_barrier(CLK_LOCAL_MEM_FENCE);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        // ...
#endif
    }
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // ...
#endif
    template <int D0, int D1, int D2> friend class tiled_index;
};

template <int D0, int D1 = 0, int D2 = 0>
class tiled_index {
public:
    static const int rank = 3;
    // ...
    tiled_index(const tiled_index& o) restrict(amp,cpu)
        : global(o.global), local(o.local), tile(o.tile),
          tile_origin(o.tile_origin), barrier(o.barrier) {}
    // ...
    static const int tile_dim0 = D0;
    static const int tile_dim1 = D1;
    static const int tile_dim2 = D2;
    // ...
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    tiled_index(int a0, int a1, int a2, int b0, int b1, int b2,
                int c0, int c1, int c2, tile_barrier& pb) restrict(amp,cpu)
        : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0),
          tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_extent(D0, D1, D2) {}
#endif
    // ...
    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() restrict(amp)
        : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
          local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
          tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
          tile_origin(index<3>(amp_get_global_id(2) - amp_get_local_id(2),
                               amp_get_global_id(1) - amp_get_local_id(1),
                               amp_get_global_id(0) - amp_get_local_id(0))),
          tile_extent(D0, D1, D2)
    {}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() restrict(amp,cpu)
    {}
#else
    __attribute__((always_inline)) tiled_index() restrict(amp)
    {}
#endif // __KALMAR_ACCELERATOR__

    template <int D0_, int D1_, int D2_, typename K>
    friend void parallel_for_each(const accelerator_view&, tiled_extent<D0_, D1_, D2_>, const K&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename K, int D1_, int D2_, int D3_>
    friend void partitioned_task_tile(K const&, tiled_extent<D1_, D2_, D3_> const&, int);
#endif
};

template <int D0>
class tiled_index<D0, 0, 0> {
public:
    static const int rank = 1;
    // ...
    tiled_index(const tiled_index& o) restrict(amp,cpu)
        : global(o.global), local(o.local), tile(o.tile),
          tile_origin(o.tile_origin), barrier(o.barrier) {}
    // ...
    static const int tile_dim0 = D0;
    // ...
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    tiled_index(int a, int b, int c, tile_barrier& pb) restrict(amp,cpu)
        : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_extent(D0) {}
#endif
    // ...
    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() restrict(amp)
        : global(index<1>(amp_get_global_id(0))),
          local(index<1>(amp_get_local_id(0))),
          tile(index<1>(amp_get_group_id(0))),
          tile_origin(index<1>(amp_get_global_id(0) - amp_get_local_id(0))),
          tile_extent(D0)
    {}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() restrict(amp,cpu)
    {}
#else
    __attribute__((always_inline)) tiled_index() restrict(amp)
    {}
#endif // __KALMAR_ACCELERATOR__

    template <int D, typename K>
    friend void parallel_for_each(const accelerator_view&, tiled_extent<D>, const K&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename K, int D>
    friend void partitioned_task_tile(K const&, tiled_extent<D> const&, int);
#endif
};

template <int D0, int D1>
class tiled_index<D0, D1, 0> {
public:
    static const int rank = 2;
    // ...
    tiled_index(const tiled_index& o) restrict(amp,cpu)
        : global(o.global), local(o.local), tile(o.tile),
          tile_origin(o.tile_origin), barrier(o.barrier) {}
    // ...
    static const int tile_dim0 = D0;
    static const int tile_dim1 = D1;
    // ...
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& tbar) restrict(amp,cpu)
        : global(a1, a0), local(b1, b0), tile(c1, c0),
          tile_origin(a1 - b1, a0 - b0), barrier(tbar), tile_extent(D0, D1) {}
#endif
    // ...
    __attribute__((annotate("__cxxamp_opencl_index")))
#if __KALMAR_ACCELERATOR__ == 1
    __attribute__((always_inline)) tiled_index() restrict(amp)
        : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
          local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
          tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
          tile_origin(index<2>(amp_get_global_id(1) - amp_get_local_id(1),
                               amp_get_global_id(0) - amp_get_local_id(0))),
          tile_extent(D0, D1)
    {}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    __attribute__((always_inline)) tiled_index() restrict(amp,cpu)
    {}
#else
    __attribute__((always_inline)) tiled_index() restrict(amp)
    {}
#endif // __KALMAR_ACCELERATOR__

    template <int D0_, int D1_, typename K>
    friend void parallel_for_each(const accelerator_view&, tiled_extent<D0_, D1_>, const K&);
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    template <typename K, int D1_, int D2_>
    friend void partitioned_task_tile(K const&, tiled_extent<D1_, D2_> const&, int);
#endif
};

template <int D0, int D1, int D2>
class tiled_extent : public extent<3> {
public:
    static_assert(D0 > 0, "Tile size must be positive");
    static_assert(D1 > 0, "Tile size must be positive");
    static_assert(D2 > 0, "Tile size must be positive");
    static const int rank = 3;
    // ...
    tiled_extent pad() const restrict(amp,cpu) {
        tiled_extent padded(*this);
        padded[0] = (padded[0] <= D0) ? D0 : (((padded[0] + D0 - 1) / D0) * D0);
        padded[1] = (padded[1] <= D1) ? D1 : (((padded[1] + D1 - 1) / D1) * D1);
        padded[2] = (padded[2] <= D2) ? D2 : (((padded[2] + D2 - 1) / D2) * D2);
        return padded;
    }
    // ...
    tiled_extent truncate() const restrict(amp,cpu) {
        tiled_extent trunc(*this);
        trunc[0] = (trunc[0] / D0) * D0;
        trunc[1] = (trunc[1] / D1) * D1;
        trunc[2] = (trunc[2] / D2) * D2;
        return trunc;
    }
    // ...
    static const int tile_dim0 = D0;
    static const int tile_dim1 = D1;
    static const int tile_dim2 = D2;
    // ...
};

template <int D0, int D1>
class tiled_extent<D0, D1, 0> : public extent<2> {
public:
    static_assert(D0 > 0, "Tile size must be positive");
    static_assert(D1 > 0, "Tile size must be positive");
    static const int rank = 2;
    // ...
    tiled_extent pad() const restrict(amp,cpu) {
        tiled_extent padded(*this);
        padded[0] = (padded[0] <= D0) ? D0 : (((padded[0] + D0 - 1) / D0) * D0);
        padded[1] = (padded[1] <= D1) ? D1 : (((padded[1] + D1 - 1) / D1) * D1);
        return padded;
    }
    // ...
    tiled_extent truncate() const restrict(amp,cpu) {
        tiled_extent trunc(*this);
        trunc[0] = (trunc[0] / D0) * D0;
        trunc[1] = (trunc[1] / D1) * D1;
        return trunc;
    }
    // ...
    static const int tile_dim0 = D0;
    static const int tile_dim1 = D1;
    // ...
};

template <int D0>
class tiled_extent<D0, 0, 0> : public extent<1> {
public:
    static_assert(D0 > 0, "Tile size must be positive");
    static const int rank = 1;
    // ...
    tiled_extent pad() const restrict(amp,cpu) {
        tiled_extent padded(*this);
        padded[0] = (padded[0] <= D0) ? D0 : (((padded[0] + D0 - 1) / D0) * D0);
        return padded;
    }
    // ...
    tiled_extent truncate() const restrict(amp,cpu) {
        tiled_extent trunc(*this);
        trunc[0] = (trunc[0] / D0) * D0;
        return trunc;
    }
    // ...
    static const int tile_dim0 = D0;
    // ...
};
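// Example (illustrative): pad() rounds each dimension up to the next multiple
// of the tile size, truncate() rounds it down.  For tiled_extent<16> over
// extent<1>(70):  pad()[0] == 80, truncate()[0] == 64.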
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
#define SSIZE (1024 * 10)

template <int N, typename Kernel, int K>
struct cpu_helper
{
    static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) restrict(amp,cpu) {
        int i;
        for (i = 0; i < ext[N]; ++i) {
            idx[N] = i;
            cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
        }
    }
};

template <typename Kernel, int K>
struct cpu_helper<K, Kernel, K>
{
    static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) restrict(amp,cpu) {
        (const_cast<Kernel&>(k))(idx);
    }
};

template <typename Kernel, int N>
void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
    index<N> idx;
    int start = ext[0] * part / Kalmar::NTHREAD;
    int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
    for (int i = start; i < end; i++) {
        idx[0] = i;
        cpu_helper<1, Kernel, N>::call(ker, idx, ext);
    }
}
template <typename Kernel, int D0>
void partitioned_task_tile(Kernel const& f, tiled_extent<D0> const& ext, int part) {
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    // ...
    char *stk = new char[D0 * SSIZE];
    // ...
    tile_barrier::pb_t amp_bar = std::make_shared<barrier_t>(D0);
    // ...
    for (int tx = start; tx < end; tx++) {
        // ... set up one ucontext fiber per work-item of this tile
        for (int x = 0; x < D0; x++) {
            // ...
            amp_bar->setctx(++id, sp, f, tip, SSIZE);
            // ...
        }
        // ... run the fibers round-robin until the tile completes
        while (amp_bar->idx == 0) {
            amp_bar->swap(0, id);
            // ...
        }
    }
    // ...
}

template <typename Kernel, int D0, int D1>
void partitioned_task_tile(Kernel const& f, tiled_extent<D0, D1> const& ext, int part) {
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    // ...
    char *stk = new char[D1 * D0 * SSIZE];
    // ...
    tile_barrier::pb_t amp_bar = std::make_shared<barrier_t>(D0 * D1);
    // ...
    for (int tx = 0; tx < ext[1] / D1; tx++)
        for (int ty = start; ty < end; ty++) {
            // ...
            for (int x = 0; x < D1; x++)
                for (int y = 0; y < D0; y++) {
                    // ...
                    amp_bar->setctx(++id, sp, f, tip, SSIZE);
                    // ...
                }
            // ...
            while (amp_bar->idx == 0) {
                amp_bar->swap(0, id);
                // ...
            }
        }
    // ...
}

template <typename Kernel, int D0, int D1, int D2>
void partitioned_task_tile(Kernel const& f, tiled_extent<D0, D1, D2> const& ext, int part) {
    int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
    int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
    int stride = end - start;
    // ...
    char *stk = new char[D2 * D1 * D0 * SSIZE];
    // ...
    tile_barrier::pb_t amp_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
    // ...
    for (int i = 0; i < ext[2] / D2; i++)
        for (int j = 0; j < ext[1] / D1; j++)
            for (int k = start; k < end; k++) {
                // ...
                for (int x = 0; x < D2; x++)
                    for (int y = 0; y < D1; y++)
                        for (int z = 0; z < D0; z++) {
                            // ... construct the tiled_index for work-item
                            //     (x, y, z) in tile (i, j, k), bound to tbar
                            amp_bar->setctx(++id, sp, f, tip, SSIZE);
                            // ...
                        }
                // ...
                while (amp_bar->idx == 0) {
                    amp_bar->swap(0, id);
                    // ...
                }
            }
    // ...
}
template <typename Kernel, int N>
void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                     extent<N> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task<Kernel, N>,
                             std::cref(f), std::cref(compute_domain), i);
}

template <typename Kernel, int D0>
void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                     tiled_extent<D0> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile<Kernel, D0>,
                             std::cref(f), std::cref(compute_domain), i);
}

template <typename Kernel, int D0, int D1>
void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                     tiled_extent<D0, D1> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile<Kernel, D0, D1>,
                             std::cref(f), std::cref(compute_domain), i);
}

template <typename Kernel, int D0, int D1, int D2>
void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
                     tiled_extent<D0, D1, D2> const& compute_domain)
{
    Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
    for (int i = 0; i < Kalmar::NTHREAD; ++i)
        obj[i] = std::thread(partitioned_task_tile<Kernel, D0, D1, D2>,
                             std::cref(f), std::cref(compute_domain), i);
}
#endif // __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
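// Work partitioning (illustrative): the CPU path splits the outermost
// dimension evenly across Kalmar::NTHREAD host threads.  With ext[0] == 100
// and NTHREAD == 4, thread `part` covers rows
// [100*part/4, 100*(part+1)/4), i.e. [0,25), [25,50), [50,75), [75,100).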
template <typename T, int N>
struct projection_helper
{
    typedef array_view<T, N - 1> result_type;
    static_assert(N > 1,
        "projection_helper is only supported on array_view with a rank of 2 or higher");
    static result_type project(array_view<T, N>& now, int stride) restrict(amp,cpu) {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        return result_type(now.cache, ext_now, ext_base, idx_base,
                           now.offset + ext_base.size() * stride);
    }
    static result_type project(const array_view<T, N>& now, int stride) restrict(amp,cpu) {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        return result_type(now.cache, ext_now, ext_base, idx_base,
                           now.offset + ext_base.size() * stride);
    }
};

template <typename T>
struct projection_helper<T, 1>
{
    // ...
    typedef T& result_type;
    static result_type project(array_view<T, 1>& now, int i) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
    static result_type project(const array_view<T, 1>& now, int i) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
};
template <typename T, int N>
struct projection_helper<const T, N>
{
    typedef array_view<const T, N - 1> const_result_type;
    static_assert(N > 1,
        "projection_helper is only supported on array_view with a rank of 2 or higher");
    static const_result_type project(array_view<const T, N>& now, int stride) restrict(amp,cpu) {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        auto ret = const_result_type(now.cache, ext_now, ext_base, idx_base,
                                     now.offset + ext_base.size() * stride);
        return ret;
    }
    static const_result_type project(const array_view<const T, N>& now, int stride) restrict(amp,cpu) {
        int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
        for (i = N - 1; i > 0; --i) {
            ext_o[i - 1] = now.extent[i];
            ext[i - 1] = now.extent_base[i];
            idx[i - 1] = now.index_base[i];
        }
        stride += now.index_base[0];
        extent<N - 1> ext_now(ext_o);
        extent<N - 1> ext_base(ext);
        index<N - 1> idx_base(idx);
        auto ret = const_result_type(now.cache, ext_now, ext_base, idx_base,
                                     now.offset + ext_base.size() * stride);
        return ret;
    }
};

template <typename T>
struct projection_helper<const T, 1>
{
    // ...
    typedef const T& const_result_type;
    static const_result_type project(array_view<const T, 1>& now, int i) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access();
#endif
        const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
    static const_result_type project(const array_view<const T, 1>& now, int i) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        now.cache.get_cpu_access();
#endif
        const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
        return *ptr;
    }
};
template <typename T, int N>
struct array_projection_helper
{
    typedef array_view<T, N - 1> result_type;
    typedef array_view<const T, N - 1> const_result_type;
    static_assert(N > 1,
        "projection_helper is only supported on array with a rank of 2 or higher");
    static result_type project(array<T, N>& now, int stride) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        int comp[N - 1], i;
        for (i = N - 1; i > 0; --i)
            comp[i - 1] = now.extent[i];
        extent<N - 1> ext(comp);
        int offset = ext.size() * stride;
#if __KALMAR_ACCELERATOR__ != 1
        if (offset >= now.extent.size()) {
            // ... out-of-range projection
        }
#endif
        return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
    }
    static const_result_type project(const array<T, N>& now, int stride) restrict(amp,cpu) {
        int comp[N - 1], i;
        for (i = N - 1; i > 0; --i)
            comp[i - 1] = now.extent[i];
        extent<N - 1> ext(comp);
        int offset = ext.size() * stride;
        return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
    }
};

template <typename T>
struct array_projection_helper<T, 1>
{
    // ...
    typedef T& result_type;
    typedef const T& const_result_type;
    static result_type project(array<T, 1>& now, int i) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize(true);
#endif
        T *ptr = reinterpret_cast<T *>(now.m_device.get() + i);
        return *ptr;
    }
    static const_result_type project(const array<T, 1>& now, int i) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        now.m_device.synchronize();
#endif
        const T *ptr = reinterpret_cast<const T *>(now.m_device.get() + i);
        return *ptr;
    }
};
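// Projection in practice (illustrative): indexing a rank-N container with a
// single int projects away the most-significant dimension.  Given
// array_view<int, 2> av(3, 4, data), av[1] is an array_view<int, 1> of
// extent 4 aliasing row 1, and av[1][2] is the element at (1, 2).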
template <int N>
static const extent<N>& check(const extent<N>& ext) {
#if __KALMAR_ACCELERATOR__ != 1
    for (int i = 0; i < N; i++) {
        // ... reject non-positive extents
    }
#endif
    return ext;
}

template <typename T, int N> void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array_view<const T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array_view<T, N>& src, array<T, N>& dest);
template <typename T, int N> void copy(const array<T, N>& src, const array_view<T, N>& dest);
template <typename T, int N> void copy(const array<T, N>& src, array<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, const array_view<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
template <typename InputIter, typename T, int N> void copy(InputIter srcBegin, array<T, N>& dest);
template <typename OutputIter, typename T, int N> void copy(const array_view<T, N>& src, OutputIter destBegin);
template <typename OutputIter, typename T, int N> void copy(const array<T, N>& src, OutputIter destBegin);
template <typename T, int N = 1>
class array {
    static_assert(!std::is_const<T>::value, "array<const T> is not supported");
    static_assert(0 == (sizeof(T) % sizeof(int)),
        "only value types whose size is a multiple of the size of an integer are allowed in array");
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<T> acc_buffer_t;
#endif

public:
    static const int rank = N;
    // ...
    array(const array& other)
        : array(other.get_extent(), other.get_accelerator_view())
    {
        // ...
    }
    array(array&& other)
        // ...
    { other.m_device.reset(); }
    // ...
    template <typename InputIter>
    array(const Concurrency::extent<N>& ext, InputIter srcBegin)
        // ...
    template <typename InputIter>
    array(const Concurrency::extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
        : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}
    template <typename InputIter>
    array(int e0, InputIter srcBegin)
        // ...
    template <typename InputIter>
    array(int e0, InputIter srcBegin, InputIter srcEnd)
        // ...
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin)
        // ...
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
        // ...
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin)
        // ...
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
        // ...
    array(const Concurrency::extent<N>& ext, accelerator_view av,
          access_type cpu_access_type = access_type_auto)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {}
#else
        // ...
#endif
    // ...
    template <typename InputIter>
    array(const Concurrency::extent<N>& ext, InputIter srcBegin, accelerator_view av,
          access_type cpu_access_type = access_type_auto)
        // ...
    template <typename InputIter>
    array(const Concurrency::extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av,
          access_type cpu_access_type = access_type_auto)
        : array(ext, av, cpu_access_type) {
        if (ext.size() < std::distance(srcBegin, srcEnd)) {
            // ...
        }
        // ...
    }
    // ...
    template <typename InputIter>
    array(int e0, InputIter srcBegin,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        // ...
    template <typename InputIter>
    array(int e0, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        // ...
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        // ...
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        // ...
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        // ...
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, access_type cpu_access_type = access_type_auto)
        // ...
    array(const Concurrency::extent<N>& ext, accelerator_view av, accelerator_view associated_av)
#if __KALMAR_ACCELERATOR__ == 1
        : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {}
#else
        // ...
#endif
    // ...
    template <typename InputIter>
    array(const Concurrency::extent<N>& ext, InputIter srcBegin,
          accelerator_view av, accelerator_view associated_av)
        // ...
    template <typename InputIter>
    array(const Concurrency::extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
          accelerator_view av, accelerator_view associated_av)
        : array(ext, av, associated_av) {
        if (ext.size() < std::distance(srcBegin, srcEnd)) {
            // ...
        }
        // ...
    }
    // ...
    array(const array_view<const T, N>& src,
          accelerator_view av, accelerator_view associated_av)
        : array(src.get_extent(), av, associated_av)
    {
        // ...
    }
    // ...
    template <typename InputIter>
    array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        // ...
    template <typename InputIter>
    array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        // ...
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        // ...
    template <typename InputIter>
    array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        // ...
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
        // ...
    template <typename InputIter>
    array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
        // ...
    array& operator=(const array& other) {
        if (this != &other) {
            array arr(other);
            *this = std::move(arr);
        }
        return *this;
    }
    array& operator=(array&& other) {
        if (this != &other) {
            // ...
            m_device = other.m_device;
            other.m_device.reset();
        }
        return *this;
    }
    array& operator=(const array_view<const T, N>& src) {
        array arr(src);
        *this = std::move(arr);
        return *this;
    }
    // ...
    void copy_to(array& dest) const {
#if __KALMAR_ACCELERATOR__ != 1
        for (int i = 0; i < N; i++) {
            if (dest.extent[i] < this->extent[i]) {
                // ...
            }
        }
#endif
        // ...
    }
    T* data() const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        if (!m_device.get())
            return nullptr;
        m_device.synchronize(true);
#endif
        return reinterpret_cast<T*>(m_device.get());
    }
    // ...
    operator std::vector<T>() const {
        std::vector<T> vec(extent.size());
        // ...
        return vec;   // NRVO; returning std::move(vec) would defeat copy elision
    }
    // ...
    T& operator[](const index<N>& idx) restrict(amp,cpu) {
#ifndef __KALMAR_ACCELERATOR__
        if (!m_device.get()) {
            // ...
        }
        m_device.synchronize(true);
#endif
        T *ptr = reinterpret_cast<T*>(m_device.get());
        // ...
    }
    T& operator()(const index<N>& idx) restrict(amp,cpu) {
        return (*this)[idx];
    }
    // ...
    const T& operator[](const index<N>& idx) const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        if (!m_device.get()) {
            // ...
        }
        m_device.synchronize();
#endif
        T *ptr = reinterpret_cast<T*>(m_device.get());
        // ...
    }
    const T& operator()(const index<N>& idx) const restrict(amp,cpu) {
        return (*this)[idx];
    }
    // ...
    T& operator()(int i0, int i1, int i2) restrict(amp,cpu) {
        return (*this)[index<3>(i0, i1, i2)];
    }
    const T& operator()(int i0, int i1, int i2) const restrict(amp,cpu) {
        return (*this)[index<3>(i0, i1, i2)];
    }
    // ...
    typename array_projection_helper<T, N>::result_type
    operator[](int i) restrict(amp,cpu) {
        return array_projection_helper<T, N>::project(*this, i);
    }
    typename array_projection_helper<T, N>::const_result_type
    operator[](int i) const restrict(amp,cpu) {
        return array_projection_helper<T, N>::project(*this, i);
    }
    // ...
    array_view<T, N> section(const index<N>& origin, const Concurrency::extent<N>& ext) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        array_view<T, N> av(*this);
        return av.section(origin, ext);
    }
    array_view<const T, N> section(const index<N>& origin, const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
        array_view<const T, N> av(*this);
        return av.section(origin, ext);
    }
    // ...
    array_view<T, 1> section(int i0, int e0) restrict(amp,cpu) {
        static_assert(N == 1, "Rank must be 1");
        // ...
    }
    array_view<const T, 1> section(int i0, int e0) const restrict(amp,cpu) {
        static_assert(N == 1, "Rank must be 1");
        // ...
    }
    array_view<T, 2> section(int i0, int i1, int e0, int e1) restrict(amp,cpu) {
        static_assert(N == 2, "Rank must be 2");
        // ...
    }
    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) {
        static_assert(N == 2, "Rank must be 2");
        // ...
    }
    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) restrict(amp,cpu) {
        static_assert(N == 3, "Rank must be 3");
        // ...
    }
    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) {
        static_assert(N == 3, "Rank must be 3");
        // ...
    }
    template <typename ElementType>
    array_view<ElementType, 1> reinterpret_as() restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
        if ((extent.size() * sizeof(T)) % sizeof(ElementType)) {
            // ...
        }
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        // ...
    }
    template <typename ElementType>
    array_view<const ElementType, 1> reinterpret_as() const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        // ...
    }
    // ...
#if __KALMAR_ACCELERATOR__ != 1
    // ...
#endif
#if __KALMAR_ACCELERATOR__ != 1
    // ...
#endif
    const acc_buffer_t& internal() const restrict(amp,cpu) { return m_device; }
    int get_offset() const restrict(amp,cpu) { return 0; }
    // ...

private:
    // ...
    acc_buffer_t m_device;
    template <typename Q, int K> friend struct projection_helper;
    template <typename Q, int K> friend struct array_projection_helper;
};

template <typename T>
struct __has_data
{
private:
    struct two { char __lx; char __lxx; };
    template <typename C> static char test(decltype(std::declval<C>().data()));
    template <typename C> static two test(...);
public:
    static const bool value = sizeof(test<T>(0)) == 1;
};

template <typename T>
struct __has_size
{
private:
    struct two { char __lx; char __lxx; };
    template <typename C> static char test(decltype(&C::size));
    template <typename C> static two test(...);
public:
    static const bool value = sizeof(test<T>(0)) == 1;
};

template <typename T>
struct __is_container
{
    using _T = typename std::remove_reference<T>::type;
    static const bool value = __has_size<_T>::value && __has_data<_T>::value;
};
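// Usage sketch (illustrative): an array owns storage of a fixed extent on an
// accelerator_view; data moves explicitly via copy()/copy_async().
//
//   std::vector<float> host(256, 1.0f);
//   Concurrency::array<float, 1> dev(Concurrency::extent<1>(256), host.begin());
//   dev[0] = 42.0f;                       // element access (synchronizes on the CPU path)
//   std::vector<float> back = dev;        // conversion copies back to the host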
template <typename T, int N = 1>
class array_view
{
    static_assert(0 == (sizeof(T) % sizeof(int)),
        "only value types whose size is a multiple of the size of an integer are allowed in array views");
    typedef typename std::remove_const<T>::type nc_T;
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<T> acc_buffer_t;
#endif

public:
    static const int rank = N;
    // ...
    array_view(array<T, N>& src) restrict(amp,cpu)
        : cache(src.internal()), extent(src.get_extent()),
          extent_base(extent), index_base(), offset(0) {}
    // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(const Concurrency::extent<N>& ext, Container& src)
        : array_view(ext, src.data())
    {
        static_assert(std::is_same<decltype(src.data()), T*>::value,
            "container element type and array view element type must match");
    }
    // ...
    array_view(const Concurrency::extent<N>& ext, value_type* src) restrict(amp,cpu)
#if __KALMAR_ACCELERATOR__ == 1
        : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {}
#else
        : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {}
#endif
    // ...
    explicit array_view(const Concurrency::extent<N>& ext)
        : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {}
    // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, Container& src)
        // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, Container& src)
        // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, int e2, Container& src)
        // ...
    array_view(int e0, value_type *src) restrict(amp,cpu)
        // ...
    array_view(int e0, int e1, value_type *src) restrict(amp,cpu)
        // ...
    array_view(int e0, int e1, int e2, value_type *src) restrict(amp,cpu)
        // ...
    explicit array_view(int e0) restrict(cpu)
        // ...
    explicit array_view(int e0, int e1) restrict(cpu)
        // ...
    explicit array_view(int e0, int e1, int e2) restrict(cpu)
        // ...
    array_view(const array_view& other) restrict(amp,cpu)
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}
    // ...
    array_view& operator=(const array_view& other) restrict(amp,cpu) {
        if (this != &other) {
            cache = other.cache;
            extent = other.extent;
            index_base = other.index_base;
            extent_base = other.extent_base;
            offset = other.offset;
        }
        return *this;
    }
    // ...
    void copy_to(array<T, N>& dest) const {
#if __KALMAR_ACCELERATOR__ != 1
        for (int i = 0; i < N; i++) {
            // ...
        }
#endif
        // ...
    }
    T* data() const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        static_assert(N == 1, "data() is only permissible on array views of rank 1");
        return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
    }
    // ...
    completion_future synchronize_async() const {
        std::future<void> fut = std::async([&]() mutable { synchronize(); });
        return completion_future(fut.share());
    }
    // ...
    void synchronize_to(const accelerator_view& av) const {
#if __KALMAR_ACCELERATOR__ != 1
        cache.sync_to(av.pQueue);
#endif
    }
    // ...
    T& operator[](const index<N>& idx) const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T*>(cache.get() + offset);
        return ptr[Kalmar::amp_helper<N, index<N>,
                   Concurrency::extent<N>>::flatten(idx + index_base, extent_base)];
    }
    // ...
    T& operator()(const index<N>& idx) const restrict(amp,cpu) {
        return (*this)[idx];
    }
    // ...
    T& get_ref(const index<N>& idx) const restrict(amp,cpu);
    // ...
    T& operator()(int i0, int i1) const restrict(amp,cpu) {
        static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
        return (*this)[index<2>(i0, i1)];
    }
    T& operator()(int i0, int i1, int i2) const restrict(amp,cpu) {
        static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
        return (*this)[index<3>(i0, i1, i2)];
    }
    // ...
    typename projection_helper<T, N>::result_type
    operator[](int i) const restrict(amp,cpu) {
        return projection_helper<T, N>::project(*this, i);
    }
    typename projection_helper<T, N>::result_type
    operator()(int i0) const restrict(amp,cpu) { return (*this)[i0]; }
    // ...
    array_view<T, N> section(const index<N>& idx, const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        // ...
    }
    array_view<T, N> section(const index<N>& idx) const restrict(amp,cpu) {
        Concurrency::extent<N> ext(extent);
        // ...
        return section(idx, ext);
    }
    array_view<T, N> section(const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
        index<N> idx;
        return section(idx, ext);
    }
    // ...
    array_view<T, 1> section(int i0, int e0) const restrict(amp,cpu) {
        static_assert(N == 1, "Rank must be 1");
        // ...
    }
    array_view<T, 2> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) {
        static_assert(N == 2, "Rank must be 2");
        // ...
    }
    array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) {
        static_assert(N == 3, "Rank must be 3");
        // ...
    }
    // ...
    template <typename ElementType>
    array_view<ElementType, 1> reinterpret_as() const restrict(amp,cpu) {
        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
        if ((extent.size() * sizeof(T)) % sizeof(ElementType)) {
            // ...
        }
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        return array_view<ElementType, 1>(buffer_type(cache), Concurrency::extent<1>(size),
            (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
    }
    // ...
    template <int K>
    array_view<T, K> view_as(const Concurrency::extent<K>& viewExtent) const restrict(amp,cpu) {
        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        // ...
    }

    ~array_view() restrict(amp,cpu) {}
    // ...
    template <int D0, int D1 = 0, int D2 = 0>
    T& operator[](const tiled_index<D0, D1, D2>& idx) const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access(true);
#endif
        T *ptr = reinterpret_cast<T*>(cache.get() + offset);
        // ...
    }
    // ...
    const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; }
    // ...
    int get_offset() const restrict(amp,cpu) { return offset; }
    // ...
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;

    template <typename Q, int K>
    friend bool is_flat(const array_view<Q, K>&) noexcept;
    template <typename Q, int K>
    friend void copy(const array<Q, K>&, const array_view<Q, K>&);
    template <typename InputIter, typename Q, int K>
    friend void copy(InputIter, InputIter, const array_view<Q, K>&);
    template <typename Q, int K>
    friend void copy(const array_view<const Q, K>&, array<Q, K>&);
    template <typename OutputIter, typename Q, int K>
    friend void copy(const array_view<Q, K>&, OutputIter);
    template <typename Q, int K>
    friend void copy(const array_view<const Q, K>&, const array_view<Q, K>&);

    // ...
    array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext,
               int offset) restrict(amp,cpu)
        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
    // ...
    array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext_now,
               const Concurrency::extent<N>& ext_b, const index<N>& idx_b, int off) restrict(amp,cpu)
        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {}
    // ...
};
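// Usage sketch (illustrative): an array_view wraps existing host storage
// without copying; writes through the view are reflected in the source
// container after synchronize().
//
//   std::vector<int> v(12, 0);
//   Concurrency::array_view<int, 2> av(3, 4, v);   // 3 x 4 view over v
//   av(1, 2) = 7;                                  // same element as v[1 * 4 + 2]
//   av.synchronize();                              // flush pending writes to v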
template <typename T, int N>
class array_view<const T, N>
{
public:
    typedef typename std::remove_const<T>::type nc_T;
#if __KALMAR_ACCELERATOR__ == 1
    typedef Kalmar::_data<nc_T> acc_buffer_t;
#else
    typedef Kalmar::_data_host<const T> acc_buffer_t;
#endif
    static const int rank = N;
    // ...
    array_view(const array<T, N>& src) restrict(amp,cpu)
        : cache(src.internal()), extent(src.get_extent()),
          extent_base(extent), index_base(), offset(0) {}
    // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(const Concurrency::extent<N>& ext, const Container& src)
        : array_view(ext, src.data())
    {
        static_assert(std::is_same<
            typename std::remove_const<
                typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value,
            "container element type and array view element type must match");
    }
    // ...
    array_view(const Concurrency::extent<N>& ext, const value_type* src) restrict(amp,cpu)
#if __KALMAR_ACCELERATOR__ == 1
        : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
#else
        : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
#endif
    // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, Container& src)
        // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, Container& src)
        // ...
    template <typename Container,
              class = typename std::enable_if<__is_container<Container>::value>::type>
    array_view(int e0, int e1, int e2, Container& src)
        // ...
    array_view(int e0, const value_type *src) restrict(amp,cpu)
        // ...
    array_view(int e0, int e1, const value_type *src) restrict(amp,cpu)
        // ...
    array_view(int e0, int e1, int e2, const value_type *src) restrict(amp,cpu)
        // ...
    array_view(const array_view<nc_T, N>& other) restrict(amp,cpu)
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}
    // ...
    array_view(const array_view& other) restrict(amp,cpu)
        : cache(other.cache), extent(other.extent), extent_base(other.extent_base),
          index_base(other.index_base), offset(other.offset) {}
    // ...
    array_view& operator=(const array_view<nc_T, N>& other) restrict(amp,cpu) {
        cache = other.cache;
        extent = other.extent;
        index_base = other.index_base;
        extent_base = other.extent_base;
        offset = other.offset;
        return *this;
    }
    array_view& operator=(const array_view& other) restrict(amp,cpu) {
        if (this != &other) {
            cache = other.cache;
            extent = other.extent;
            index_base = other.index_base;
            extent_base = other.extent_base;
            offset = other.offset;
        }
        return *this;
    }
    // ...
    const T* data() const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access();
#endif
        static_assert(N == 1, "data() is only permissible on array views of rank 1");
        return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
    }
    // ...
    completion_future synchronize_async() const {
        std::future<void> fut = std::async([&]() mutable { synchronize(); });
        return completion_future(fut.share());
    }
    // ...
    void synchronize_to(const accelerator_view& av) const {
#if __KALMAR_ACCELERATOR__ != 1
        cache.sync_to(av.pQueue);
#endif
    }
    // ...
    const T& operator[](const index<N>& idx) const restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ != 1
        cache.get_cpu_access();
#endif
        const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
        return ptr[Kalmar::amp_helper<N, index<N>,
                   Concurrency::extent<N>>::flatten(idx + index_base, extent_base)];
    }
    const T& operator()(const index<N>& idx) const restrict(amp,cpu) {
        return (*this)[idx];
    }
    // ...
    const T& get_ref(const index<N>& idx) const restrict(amp,cpu);
    // ...
    const T& operator()(int i0) const restrict(amp,cpu) {
        static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
        return (*this)[index<1>(i0)];
    }
    const T& operator()(int i0, int i1) const restrict(amp,cpu) {
        static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
        return (*this)[index<2>(i0, i1)];
    }
    const T& operator()(int i0, int i1, int i2) const restrict(amp,cpu) {
        static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
        return (*this)[index<3>(i0, i1, i2)];
    }
    // ...
    typename projection_helper<const T, N>::const_result_type
    operator[](int i) const restrict(amp,cpu) {
        return projection_helper<const T, N>::project(*this, i);
    }
    // ...
    array_view<const T, N> section(const index<N>& idx) const restrict(amp,cpu) {
        Concurrency::extent<N> ext(extent);
        // ...
        return section(idx, ext);
    }
    array_view<const T, N> section(const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
        index<N> idx;
        return section(idx, ext);
    }
    // ...
    array_view<const T, 1> section(int i0, int e0) const restrict(amp,cpu) {
        static_assert(N == 1, "Rank must be 1");
        // ...
    }
    array_view<const T, 2> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) {
        static_assert(N == 2, "Rank must be 2");
        // ...
    }
    array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) {
        static_assert(N == 3, "Rank must be 3");
        // ...
    }
    // ...
    template <typename ElementType>
    array_view<const ElementType, 1> reinterpret_as() const restrict(amp,cpu) {
        static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        static_assert(!(std::is_pointer<ElementType>::value), "can't use pointer in the kernel");
        static_assert(!(std::is_same<ElementType, short>::value), "can't use short in the kernel");
#endif
        int size = extent.size() * sizeof(T) / sizeof(ElementType);
        using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
        return array_view<const ElementType, 1>(buffer_type(cache), Concurrency::extent<1>(size),
            (offset + index_base[0]) * sizeof(T) / sizeof(ElementType));
    }
    // ...
    template <int K>
    array_view<const T, K> view_as(const Concurrency::extent<K>& viewExtent) const restrict(amp,cpu) {
        static_assert(N == 1, "view_as is only permissible on array views of rank 1");
#if __KALMAR_ACCELERATOR__ != 1
        // ...
#endif
        // ...
    }

    ~array_view() restrict(amp,cpu) {}
    // ...
    const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; }
    // ...
    int get_offset() const restrict(amp,cpu) { return offset; }
    // ...
    template <typename Q, int K> friend class array;
    template <typename Q, int K> friend class array_view;

    template <typename Q, int K>
    friend bool is_flat(const array_view<Q, K>&) noexcept;
    template <typename Q, int K>
    friend void copy(const array<Q, K>&, const array_view<Q, K>&);
    template <typename InputIter, typename Q, int K>
    friend void copy(InputIter, InputIter, const array_view<Q, K>&);
    template <typename Q, int K>
    friend void copy(const array_view<const Q, K>&, array<Q, K>&);
    template <typename OutputIter, typename Q, int K>
    friend void copy(const array_view<Q, K>&, OutputIter);
    template <typename Q, int K>
    friend void copy(const array_view<const Q, K>&, const array_view<Q, K>&);

    // ...
    array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext,
               int offset) restrict(amp,cpu)
        : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
    // ...
    array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext_now,
               const Concurrency::extent<N>& ext_b, const index<N>& idx_b, int off) restrict(amp,cpu)
        : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {}
    // ...
};
template <typename T, int N>
static inline bool is_flat(const array_view<T, N>& av) noexcept {
    return av.extent == av.extent_base && av.index_base == index<N>();
}

template <typename T>
static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
template <typename InputIter, typename T, int N, int dim>
struct copy_input
{
    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        int stride = 1;
        for (int i = dim; i < N; i++)
            stride *= base[i];
        ptr += stride * idx[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse one dimension deeper, then advance ptr by stride
        }
    }
};

template <typename InputIter, typename T, int N>
struct copy_input<InputIter, T, N, N>
{
    void operator()(InputIter& It, T* ptr, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        InputIter end = It;
        std::advance(end, ext[N - 1]);
        // ... copy [It, end) into the innermost row at ptr, then It = end
    }
};

template <typename OutputIter, typename T, int N, int dim>
struct copy_output
{
    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        int stride = 1;
        for (int i = dim; i < N; i++)
            stride *= base[i];
        ptr += stride * idx[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse one dimension deeper, then advance ptr by stride
        }
    }
};

template <typename OutputIter, typename T, int N>
struct copy_output<OutputIter, T, N, N>
{
    void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
                    const extent<N>& base, const index<N>& idx)
    {
        // ...
        It = std::copy(ptr, ptr + ext[N - 1], It);
    }
};
template <typename T, int N, int dim>
struct copy_bidir
{
    void operator()(const T* src, T* dst, const extent<N>& ext,
                    const extent<N>& base1, const index<N>& idx1,
                    const extent<N>& base2, const index<N>& idx2)
    {
        int stride1 = 1;
        for (int i = dim; i < N; i++)
            stride1 *= base1[i];
        src += stride1 * idx1[dim - 1];
        int stride2 = 1;
        for (int i = dim; i < N; i++)
            stride2 *= base2[i];
        dst += stride2 * idx2[dim - 1];
        for (int i = 0; i < ext[dim - 1]; i++) {
            // ... recurse one dimension deeper, then advance src and dst
        }
    }
};

template <typename T, int N>
struct copy_bidir<T, N, N>
{
    void operator()(const T* src, T* dst, const extent<N>& ext,
                    const extent<N>& base1, const index<N>& idx1,
                    const extent<N>& base2, const index<N>& idx2)
    {
        // ... copy ext[N - 1] contiguous elements from src to dst
    }
};
template <typename Iter, typename T, int N>
struct do_copy
{
    template <template <typename, int> class _amp_container>
    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
        size_t size = dest.get_extent().size();
        size_t offset = dest.get_offset();
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        std::copy(srcBegin, srcEnd, ptr);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }
    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, N>& src, Iter destBegin) {
        size_t size = src.get_extent().size();
        size_t offset = src.get_offset();
        bool modify = false;
        const T* ptr = src.internal().map_ptr(modify, size, offset);
        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
};

template <typename Iter, typename T>
struct do_copy<Iter, T, 1>
{
    template <template <typename, int> class _amp_container>
    void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
        size_t size = dest.get_extent().size();
        size_t offset = dest.get_offset() + dest.get_index_base()[0];
        bool modify = true;
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        std::copy(srcBegin, srcEnd, ptr);
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }
    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, 1>& src, Iter destBegin) {
        size_t size = src.get_extent().size();
        size_t offset = src.get_offset() + src.get_index_base()[0];
        bool modify = false;
        const T* ptr = src.internal().map_ptr(modify, size, offset);
        std::copy(ptr, ptr + src.get_extent().size(), destBegin);
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
};
template <typename T, int N>
struct do_copy<T*, T, N>
{
    template <template <typename, int> class _amp_container>
    void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
    }
    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, N>& src, T* destBegin) {
        src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
    }
};

template <typename T>
struct do_copy<T*, T, 1>
{
    template <template <typename, int> class _amp_container>
    void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
        dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
                              dest.get_offset() + dest.get_index_base()[0], true);
    }
    template <template <typename, int> class _amp_container>
    void operator()(const _amp_container<T, 1>& src, T* destBegin) {
        src.internal().read(destBegin, src.get_extent().size(),
                            src.get_offset() + src.get_index_base()[0]);
    }
};
template <typename T, int N>
void copy(const array<T, N>& src, array<T, N>& dest) {
    src.internal().copy(dest.internal(), 0, 0, 0);
}

template <typename T, int N>
void copy(const array<T, N>& src, const array_view<T, N>& dest) {
    if (is_flat(dest))
        src.internal().copy(dest.internal(), src.get_offset(),
                            dest.get_offset(), dest.get_extent().size());
    else {
        // ...
        size_t srcSize = src.extent.size();
        size_t srcOffset = 0;
        bool srcModify = false;
        size_t destSize = dest.extent_base.size();
        size_t destOffset = dest.offset;
        bool destModify = true;
        // ...
        T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
        T* p = pSrc;
        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
        // ... copy row by row into the strided destination
        dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
        src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
    }
}

template <typename T>
void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename T, int N>
void copy(const array_view<const T, N>& src, array<T, N>& dest) {
    if (is_flat(src))
        src.internal().copy(dest.internal(), src.get_offset(),
                            dest.get_offset(), dest.get_extent().size());
    else {
        // ...
        size_t srcSize = src.extent_base.size();
        size_t srcOffset = src.offset;
        bool srcModify = false;
        size_t destSize = dest.extent.size();
        size_t destOffset = 0;
        bool destModify = true;
        // ...
        T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
        const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
        // ... gather row by row from the strided source
        src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
    }
}

template <typename T, int N>
void copy(const array_view<T, N>& src, array<T, N>& dest) {
    // ... delegates to the array_view<const T, N> overload
}

template <typename T>
void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename T, int N>
void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
    if (is_flat(src)) {
        if (is_flat(dest))
            src.internal().copy(dest.internal(), src.get_offset(),
                                dest.get_offset(), dest.get_extent().size());
        else {
            // ...
            size_t srcSize = src.extent.size();
            size_t srcOffset = 0;
            bool srcModify = false;
            size_t destSize = dest.extent_base.size();
            size_t destOffset = dest.offset;
            bool destModify = true;
            // ...
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            // ... scatter row by row into the strided destination
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        }
    } else {
        if (is_flat(dest)) {
            // ...
            size_t srcSize = src.extent_base.size();
            size_t srcOffset = src.offset;
            bool srcModify = false;
            size_t destSize = dest.extent.size();
            size_t destOffset = 0;
            bool destModify = true;
            // ...
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            // ... gather row by row from the strided source
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        } else {
            // ...
            size_t srcSize = src.extent_base.size();
            size_t srcOffset = src.offset;
            bool srcModify = false;
            size_t destSize = dest.extent_base.size();
            size_t destOffset = dest.offset;
            bool destModify = true;
            // ...
            const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
            T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
            copy_bidir<T, N, 1>()(pSrc, pDst, src.extent, src.extent_base,
                                  src.index_base, dest.extent_base, dest.index_base);
            dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
            src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
        }
    }
}

template <typename T, int N>
void copy(const array_view<T, N>& src, const array_view<T, N>& dest) {
    // ... delegates to the array_view<const T, N> overload
}

template <typename T>
void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
    src.internal().copy(dest.internal(),
                        src.get_offset() + src.get_index_base()[0],
                        dest.get_offset() + dest.get_index_base()[0],
                        dest.get_extent().size());
}
template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
#if __KALMAR_ACCELERATOR__ != 1
    if ((std::distance(srcBegin, srcEnd) <= 0) ||
        (std::distance(srcBegin, srcEnd) < dest.get_extent().size())) {
        // ...
    }
#endif
    // ...
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, array<T, N>& dest) {
    InputIter srcEnd = srcBegin;
    std::advance(srcEnd, dest.get_extent().size());
    copy(srcBegin, srcEnd, dest);
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
    if (is_flat(dest)) {
        // ...
    } else {
        // ...
        size_t size = dest.extent_base.size();
        size_t offset = dest.offset;
        bool modify = true;
        // ...
        T* ptr = dest.internal().map_ptr(modify, size, offset);
        // ... copy_input walks the strided destination
        dest.internal().unmap_ptr(ptr, modify, size, offset);
    }
}

template <typename InputIter, typename T, int N>
void copy(InputIter srcBegin, const array_view<T, N>& dest) {
    InputIter srcEnd = srcBegin;
    std::advance(srcEnd, dest.get_extent().size());
    copy(srcBegin, srcEnd, dest);
}

template <typename OutputIter, typename T, int N>
void copy(const array<T, N>& src, OutputIter destBegin) {
    // ...
}

template <typename OutputIter, typename T, int N>
void copy(const array_view<T, N>& src, OutputIter destBegin) {
    if (is_flat(src)) {
        // ...
    } else {
        // ...
        size_t size = src.extent_base.size();
        size_t offset = src.offset;
        bool modify = false;
        // ...
        T* ptr = src.internal().map_ptr(modify, size, offset);
        // ... copy_output walks the strided source
        src.internal().unmap_ptr(ptr, modify, size, offset);
    }
}
template <typename T, int N>
completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename T, int N>
completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred,
                                       [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred,
                                       [&, srcBegin]() mutable { copy(srcBegin, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred,
                                       [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
    return completion_future(fut.share());
}

template <typename InputIter, typename T, int N>
completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
    std::future<void> fut = std::async(std::launch::deferred,
                                       [&, srcBegin]() mutable { copy(srcBegin, dest); });
    return completion_future(fut.share());
}

template <typename OutputIter, typename T, int N>
completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
    std::future<void> fut = std::async(std::launch::deferred,
                                       [&, destBegin]() mutable { copy(src, destBegin); });
    return completion_future(fut.share());
}

template <typename OutputIter, typename T, int N>
completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
    std::future<void> fut = std::async(std::launch::deferred,
                                       [&, destBegin]() mutable { copy(src, destBegin); });
    return completion_future(fut.share());
}

// Further copy_async overloads (parameter lists elided in this excerpt) share
// the same deferred-launch body:
//
//   std::future<void> fut = std::async(std::launch::deferred,
//                                      [&]() mutable { copy(src, dest); });
//   return completion_future(fut.share());
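// Usage sketch (illustrative): copy() blocks, copy_async() returns a
// completion_future backed by a deferred std::async task.
//
//   std::vector<int> host(100);
//   Concurrency::array<int, 1> dev(Concurrency::extent<1>(100));
//   copy(host.begin(), host.end(), dev);           // synchronous
//   completion_future cf = copy_async(dev, host.begin());
//   cf.wait();                                     // runs the deferred copy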
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) restrict(amp);
extern "C" int atomic_exchange_int(int *p, int val) restrict(amp);
extern "C" float atomic_exchange_float(float *p, float val) restrict(amp);

static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) restrict(amp,cpu) {
    return atomic_exchange_unsigned(dest, val);
}
static inline int atomic_exchange(int * dest, int val) restrict(amp,cpu) {
    return atomic_exchange_int(dest, val);
}
static inline float atomic_exchange(float * dest, float val) restrict(amp,cpu) {
    return atomic_exchange_float(dest, val);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
int atomic_exchange_int(int *p, int val);
float atomic_exchange_float(float *p, float val);

static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) restrict(amp,cpu) {
    return atomic_exchange_unsigned(dest, val);
}
static inline int atomic_exchange(int *dest, int val) restrict(amp,cpu) {
    return atomic_exchange_int(dest, val);
}
static inline float atomic_exchange(float *dest, float val) restrict(amp,cpu) {
    return atomic_exchange_float(dest, val);
}
#else
extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) restrict(amp,cpu);
extern int atomic_exchange(int *dest, int val) restrict(amp,cpu);
extern float atomic_exchange(float *dest, float val) restrict(amp, cpu);
#endif
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) restrict(amp);
extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) restrict(amp);

static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu) {
    *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
    return (*dest == val);
}
static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu) {
    *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
    return (*dest == val);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
int atomic_compare_exchange_int(int *dest, int expected_val, int val);

static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu) {
    *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
    return (*dest == val);
}
static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu) {
    *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
    return (*dest == val);
}
#else
extern unsigned int atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu);
extern int atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu);
#endif
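// CAS loop sketch (illustrative): atomic_compare_exchange stores the observed
// value back into *expected_val, so a retry loop reloads automatically.
//
//   unsigned int expected = *counter;
//   while (!atomic_compare_exchange(counter, &expected, expected * 2)) {
//       // expected now holds the freshly observed value; retry
//   }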
5493 #if __KALMAR_ACCELERATOR__ == 1 5494 extern "C" unsigned int atomic_add_unsigned(
unsigned int *p,
unsigned int val) restrict(amp);
5495 extern "C" int atomic_add_int(
int *p,
int val) restrict(amp);
5496 extern "C" float atomic_add_float(
float *p,
float val) restrict(amp);
5498 static inline unsigned int atomic_fetch_add(
unsigned int *x,
unsigned int y) restrict(amp,cpu) {
5499 return atomic_add_unsigned(x, y);
5502 return atomic_add_int(x, y);
5504 static inline float atomic_fetch_add(
float *x,
float y) restrict(amp,cpu) {
5505 return atomic_add_float(x, y);
5508 extern "C" unsigned int atomic_sub_unsigned(
unsigned int *p,
unsigned int val) restrict(amp);
5509 extern "C" int atomic_sub_int(
int *p,
int val) restrict(amp);
5510 extern "C" float atomic_sub_float(
float *p,
float val) restrict(amp);
5512 static inline unsigned int atomic_fetch_sub(
unsigned int *x,
unsigned int y) restrict(amp,cpu) {
5513 return atomic_sub_unsigned(x, y);
5516 return atomic_sub_int(x, y);
5519 return atomic_sub_float(x, y);
5522 extern "C" unsigned int atomic_and_unsigned(
unsigned int *p,
unsigned int val) restrict(amp);
5523 extern "C" int atomic_and_int(
int *p,
int val) restrict(amp);
5525 static inline unsigned int atomic_fetch_and(
unsigned int *x,
unsigned int y) restrict(amp,cpu) {
5526 return atomic_and_unsigned(x, y);
5529 return atomic_and_int(x, y);
5532 extern "C" unsigned int atomic_or_unsigned(
unsigned int *p,
unsigned int val) restrict(amp);
5533 extern "C" int atomic_or_int(
int *p,
int val) restrict(amp);
5535 static inline unsigned int atomic_fetch_or(
unsigned int *x,
unsigned int y) restrict(amp,cpu) {
5536 return atomic_or_unsigned(x, y);
5539 return atomic_or_int(x, y);
5542 extern "C" unsigned int atomic_xor_unsigned(
unsigned int *p,
unsigned int val) restrict(amp);
5543 extern "C" int atomic_xor_int(
int *p,
int val) restrict(amp);
5545 static inline unsigned int atomic_fetch_xor(
unsigned int *x,
unsigned int y) restrict(amp,cpu) {
5546 return atomic_xor_unsigned(x, y);
5549 return atomic_xor_int(x, y);
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
int atomic_add_int(int *p, int val);
float atomic_add_float(float *p, float val);

static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) restrict(amp,cpu) {
    return atomic_add_unsigned(x, y);
}
static inline int atomic_fetch_add(int *x, int y) restrict(amp,cpu) {
    return atomic_add_int(x, y);
}
static inline float atomic_fetch_add(float *x, float y) restrict(amp,cpu) {
    return atomic_add_float(x, y);
}

unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
int atomic_sub_int(int *p, int val);
float atomic_sub_float(float *p, float val);

static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) restrict(amp,cpu) {
    return atomic_sub_unsigned(x, y);
}
static inline int atomic_fetch_sub(int *x, int y) restrict(amp,cpu) {
    return atomic_sub_int(x, y);
}
static inline float atomic_fetch_sub(float *x, float y) restrict(amp,cpu) {
    return atomic_sub_float(x, y);
}

unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
int atomic_and_int(int *p, int val);

static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) restrict(amp,cpu) {
    return atomic_and_unsigned(x, y);
}
static inline int atomic_fetch_and(int *x, int y) restrict(amp,cpu) {
    return atomic_and_int(x, y);
}

unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
int atomic_or_int(int *p, int val);

static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) restrict(amp,cpu) {
    return atomic_or_unsigned(x, y);
}
static inline int atomic_fetch_or(int *x, int y) restrict(amp,cpu) {
    return atomic_or_int(x, y);
}

unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
int atomic_xor_int(int *p, int val);

static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) restrict(amp,cpu) {
    return atomic_xor_unsigned(x, y);
}
static inline int atomic_fetch_xor(int *x, int y) restrict(amp,cpu) {
    return atomic_xor_int(x, y);
}
#else
extern unsigned atomic_fetch_add(unsigned *x, unsigned y) restrict(amp,cpu);
extern int atomic_fetch_add(int *x, int y) restrict(amp,cpu);
extern float atomic_fetch_add(float *x, float y) restrict(amp,cpu);

extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) restrict(amp,cpu);
extern int atomic_fetch_sub(int *x, int y) restrict(amp,cpu);
extern float atomic_fetch_sub(float *x, float y) restrict(amp,cpu);

extern unsigned atomic_fetch_and(unsigned *x, unsigned y) restrict(amp,cpu);
extern int atomic_fetch_and(int *x, int y) restrict(amp,cpu);

extern unsigned atomic_fetch_or(unsigned *x, unsigned y) restrict(amp,cpu);
extern int atomic_fetch_or(int *x, int y) restrict(amp,cpu);

extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) restrict(amp,cpu);
extern int atomic_fetch_xor(int *x, int y) restrict(amp,cpu);
#endif
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) restrict(amp);
extern "C" int atomic_max_int(int *p, int val) restrict(amp);

static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) restrict(amp) {
    return atomic_max_unsigned(x, y);
}
static inline int atomic_fetch_max(int *x, int y) restrict(amp) {
    return atomic_max_int(x, y);
}

extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) restrict(amp);
extern "C" int atomic_min_int(int *p, int val) restrict(amp);

static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) restrict(amp) {
    return atomic_min_unsigned(x, y);
}
static inline int atomic_fetch_min(int *x, int y) restrict(amp) {
    return atomic_min_int(x, y);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
int atomic_max_int(int *p, int val);

static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) restrict(amp) {
    return atomic_max_unsigned(x, y);
}
static inline int atomic_fetch_max(int *x, int y) restrict(amp) {
    return atomic_max_int(x, y);
}

unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
int atomic_min_int(int *p, int val);

static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) restrict(amp) {
    return atomic_min_unsigned(x, y);
}
static inline int atomic_fetch_min(int *x, int y) restrict(amp) {
    return atomic_min_int(x, y);
}
#else
extern unsigned int atomic_fetch_max(unsigned int *dest, unsigned int val) restrict(amp, cpu);
extern int atomic_fetch_max(int *dest, int val) restrict(amp, cpu);

extern unsigned int atomic_fetch_min(unsigned int *dest, unsigned int val) restrict(amp, cpu);
extern int atomic_fetch_min(int *dest, int val) restrict(amp, cpu);
#endif
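// --- Example (illustrative; not part of amp.h) ---
// atomic_fetch_max folds a per-thread value into a single running maximum.
// `vals` and `result` are assumptions; result[0] should start at 0.
void max_reduce(const array_view<const unsigned int, 1>& vals,
                const array_view<unsigned int, 1>& result) {
    parallel_for_each(vals.get_extent(), [=](index<1> idx) restrict(amp) {
        atomic_fetch_max(&result[0], vals[idx]);
    });
}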
#if __KALMAR_ACCELERATOR__ == 1
extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) restrict(amp);
extern "C" int atomic_inc_int(int *p) restrict(amp);

static inline unsigned int atomic_fetch_inc(unsigned int *x) restrict(amp,cpu) {
    return atomic_inc_unsigned(x);
}
static inline int atomic_fetch_inc(int *x) restrict(amp,cpu) {
    return atomic_inc_int(x);
}

extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) restrict(amp);
extern "C" int atomic_dec_int(int *p) restrict(amp);

static inline unsigned int atomic_fetch_dec(unsigned int *x) restrict(amp,cpu) {
    return atomic_dec_unsigned(x);
}
static inline int atomic_fetch_dec(int *x) restrict(amp,cpu) {
    return atomic_dec_int(x);
}
#elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
unsigned int atomic_inc_unsigned(unsigned int *p);
int atomic_inc_int(int *p);

static inline unsigned int atomic_fetch_inc(unsigned int *x) restrict(amp,cpu) {
    return atomic_inc_unsigned(x);
}
static inline int atomic_fetch_inc(int *x) restrict(amp,cpu) {
    return atomic_inc_int(x);
}

unsigned int atomic_dec_unsigned(unsigned int *p);
int atomic_dec_int(int *p);

static inline unsigned int atomic_fetch_dec(unsigned int *x) restrict(amp,cpu) {
    return atomic_dec_unsigned(x);
}
static inline int atomic_fetch_dec(int *x) restrict(amp,cpu) {
    return atomic_dec_int(x);
}
#else
extern unsigned int atomic_fetch_inc(unsigned int * _Dest) restrict(amp, cpu);
extern int atomic_fetch_inc(int * _Dest) restrict(amp, cpu);

extern unsigned int atomic_fetch_dec(unsigned int * _Dest) restrict(amp, cpu);
extern int atomic_fetch_dec(int * _Dest) restrict(amp, cpu);
#endif
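// --- Example (illustrative; not part of amp.h) ---
// atomic_fetch_inc returns the pre-increment value, which makes it a compact
// way to hand out unique output slots (stream compaction). `in`, `out`, and
// `count` are assumptions; count[0] should start at 0.
void compact_positive(const array_view<const int, 1>& in,
                      const array_view<int, 1>& out,
                      const array_view<unsigned int, 1>& count) {
    parallel_for_each(in.get_extent(), [=](index<1> idx) restrict(amp) {
        if (in[idx] > 0) {
            unsigned int slot = atomic_fetch_inc(&count[0]);
            out[slot] = in[idx];
        }
    });
}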
template <int N, typename Kernel>
void parallel_for_each(const accelerator_view& av, extent<N> compute_domain, const Kernel& f) restrict(cpu, amp);

template <int D0, int D1, int D2, typename Kernel>
void parallel_for_each(const accelerator_view& av, tiled_extent<D0, D1, D2> compute_domain, const Kernel& f) restrict(cpu, amp);

template <int D0, int D1, typename Kernel>
void parallel_for_each(const accelerator_view& av, tiled_extent<D0, D1> compute_domain, const Kernel& f) restrict(cpu, amp);

template <int D0, typename Kernel>
void parallel_for_each(const accelerator_view& av, tiled_extent<D0> compute_domain, const Kernel& f) restrict(cpu, amp);

// The overloads below omit the accelerator_view: the runtime selects a queue
// for the kernel and forwards to the explicit-view overloads above.
template <int N, typename Kernel>
void parallel_for_each(extent<N> compute_domain, const Kernel& f) {
    auto que = Kalmar::get_availabe_que(f);
    const accelerator_view av(que);
    parallel_for_each(av, compute_domain, f);
}

template <int D0, int D1, int D2, typename Kernel>
void parallel_for_each(tiled_extent<D0, D1, D2> compute_domain, const Kernel& f) {
    auto que = Kalmar::get_availabe_que(f);
    const accelerator_view av(que);
    parallel_for_each(av, compute_domain, f);
}

template <int D0, int D1, typename Kernel>
void parallel_for_each(tiled_extent<D0, D1> compute_domain, const Kernel& f) {
    auto que = Kalmar::get_availabe_que(f);
    const accelerator_view av(que);
    parallel_for_each(av, compute_domain, f);
}

template <int D0, typename Kernel>
void parallel_for_each(tiled_extent<D0> compute_domain, const Kernel& f) {
    auto que = Kalmar::get_availabe_que(f);
    const accelerator_view av(que);
    parallel_for_each(av, compute_domain, f);
}
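// --- Example (illustrative; not part of amp.h) ---
// With the overloads above, a caller can launch without naming a queue; the
// runtime selects one through Kalmar::get_availabe_que. A minimal host-side use:
void square_in_place(std::vector<int>& v) {
    array_view<int, 1> data(static_cast<int>(v.size()), v);
    parallel_for_each(data.get_extent(), [=](index<1> idx) restrict(amp) {
        data[idx] *= data[idx];
    });
    data.synchronize();   // copy results back to the host vector
}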
template <int N, typename Kernel, typename _Tp>
struct pfe_helper
{
    // Recursively expands into a loop nest over the leading extent dimensions,
    // filling in one index component per level.
    static inline void call(Kernel& k, _Tp& idx) restrict(amp,cpu) {
        int i;
        for (i = 0; i < k.ext[N - 1]; ++i) {
            idx[N - 1] = i;
            pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
        }
    }
};
template <typename Kernel, typename _Tp>
struct pfe_helper<0, Kernel, _Tp>
{
    // Recursion terminator: every index component is set, invoke the kernel.
    static inline void call(Kernel& k, _Tp& idx) restrict(amp,cpu) {
#if __KALMAR_ACCELERATOR__ == 1
        k.k(idx);
#endif
    }
};
template <int N, typename Kernel>
class pfe_wrapper
{
public:
    explicit pfe_wrapper(const extent<N>& other, const Kernel& f) restrict(amp,cpu)
        : ext(other), k(f) {}
    void operator() (index<N> idx) restrict(amp,cpu) {
        // The last three dimensions are supplied by the 3-D hardware launch;
        // the remaining leading dimensions are iterated by pfe_helper.
        pfe_helper<N - 3, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
    }
private:
    const extent<N> ext;
    const Kernel k;
    template <int K, typename Ker, typename _Tp>
        friend struct pfe_helper;
};
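// --- For intuition (illustrative; not part of amp.h) ---
// The generic N-dimensional dispatch below launches only the last three extent
// dimensions as a 3-D hardware grid; pfe_helper turns the remaining leading
// dimensions into ordinary in-kernel loops. For N == 4, each hardware thread
// effectively runs:
//
//   // idx[3], idx[2], idx[1] already set from the 3-D thread id
//   for (int i = 0; i < ext[0]; ++i) {   // pfe_helper<1, ...>
//       idx[0] = i;
//       k.k(idx);                        // pfe_helper<0, ...> invokes the kernel functor
//   }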
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <int N, typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, extent<N> compute_domain, const Kernel& f) restrict(cpu, amp) {
#if __KALMAR_ACCELERATOR__ != 1
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    // Referencing the trampoline keeps it instantiated in CPU builds.
    int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
#endif
    size_t compute_domain_size = 1;
    for(int i = 0 ; i < N ; i++) {
        if(compute_domain[i]<=0)
            throw invalid_compute_domain("Extent is less or equal than 0.");
        if (static_cast<size_t>(compute_domain[i]) > 4294967295L)
            throw invalid_compute_domain("Extent size too large.");
        compute_domain_size *= static_cast<size_t>(compute_domain[i]);
        if (compute_domain_size > 4294967295L)
            throw invalid_compute_domain("Extent size too large.");
    }
    // The last three dimensions form the 3-D hardware launch, fastest-moving first.
    size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
                     static_cast<size_t>(compute_domain[N - 2]),
                     static_cast<size_t>(compute_domain[N - 3])};
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    const pfe_wrapper<N, Kernel> _pf(compute_domain, f);
    Kalmar::mcw_cxxamp_launch_kernel<pfe_wrapper<N, Kernel>, 3>(av.pQueue, ext, NULL, _pf);
#else //if __KALMAR_ACCELERATOR__ != 1
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
#endif
#endif
}
#pragma clang diagnostic pop
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, extent<1> compute_domain, const Kernel& f) restrict(cpu,amp) {
#if __KALMAR_ACCELERATOR__ != 1
    if(compute_domain[0]<=0) {
        throw invalid_compute_domain("Extent is less or equal than 0.");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    size_t ext = compute_domain[0];
    Kalmar::mcw_cxxamp_launch_kernel<Kernel, 1>(av.pQueue, &ext, NULL, f);
#else //if __KALMAR_ACCELERATOR__ != 1
    // Device-only build: reference the trampoline and call operator so the
    // kernel body is instantiated and emitted.
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, extent<2> compute_domain, const Kernel& f) restrict(cpu,amp) {
#if __KALMAR_ACCELERATOR__ != 1
    if(compute_domain[0]<=0 || compute_domain[1]<=0) {
        throw invalid_compute_domain("Extent is less or equal than 0.");
    }
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    // ext[0] is the fastest-moving dimension, so the extent components are swapped.
    size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    Kalmar::mcw_cxxamp_launch_kernel<Kernel, 2>(av.pQueue, ext, NULL, f);
#else //if __KALMAR_ACCELERATOR__ != 1
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
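// --- Example (illustrative; not part of amp.h) ---
// Using the explicit accelerator_view overload of the 2-D specialization above.
// Note the row/column swap in the dispatch: idx[0] is the slowest-moving
// component, while the runtime's ext[0] is the fastest.
void fill_gradient(std::vector<float>& host, int rows, int cols) {
    array_view<float, 2> img(rows, cols, host);
    accelerator_view acc_view = accelerator().get_default_view();
    parallel_for_each(acc_view, img.get_extent(), [=](index<2> idx) restrict(amp) {
        img[idx] = static_cast<float>(idx[0] + idx[1]);
    });
    img.synchronize();
}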
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, extent<3> compute_domain, const Kernel& f) restrict(cpu,amp) {
#if __KALMAR_ACCELERATOR__ != 1
    if(compute_domain[0]<=0 || compute_domain[1]<=0 || compute_domain[2]<=0) {
        throw invalid_compute_domain("Extent is less or equal than 0.");
    }
    // Guard every pairwise product as well as the full volume against 32-bit overflow.
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    if (static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
                     static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    Kalmar::mcw_cxxamp_launch_kernel<Kernel, 3>(av.pQueue, ext, NULL, f);
#else //if __KALMAR_ACCELERATOR__ != 1
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <int D0, typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, tiled_extent<D0> compute_domain, const Kernel& f) restrict(cpu,amp) {
#if __KALMAR_ACCELERATOR__ != 1
    if(compute_domain[0]<=0) {
        throw invalid_compute_domain("Extent is less or equal than 0.");
    }
    if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    size_t ext = compute_domain[0];
    size_t tile = compute_domain.tile_dim0;
    static_assert(compute_domain.tile_dim0 <= 1024, "The maximum number of threads in a tile is 1024");
    if(ext % tile != 0) {
        throw invalid_compute_domain("Extent can't be evenly divisible by tile size.");
    }
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    Kalmar::mcw_cxxamp_launch_kernel<Kernel, 1>(av.pQueue, &ext, &tile, f);
#else //if __KALMAR_ACCELERATOR__ != 1
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <int D0, int D1, typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, tiled_extent<D0, D1> compute_domain, const Kernel& f) restrict(cpu,amp) {
#if __KALMAR_ACCELERATOR__ != 1
    if(compute_domain[0]<=0 || compute_domain[1]<=0) {
        throw invalid_compute_domain("Extent is less or equal than 0.");
    }
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    size_t tile[2] = {compute_domain.tile_dim1,
                      compute_domain.tile_dim0};
    static_assert((compute_domain.tile_dim1 * compute_domain.tile_dim0) <= 1024, "The maximum number of threads in a tile is 1024");
    if((ext[0] % tile[0] != 0) || (ext[1] % tile[1] != 0)) {
        throw invalid_compute_domain("Extent can't be evenly divisible by tile size.");
    }
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    Kalmar::mcw_cxxamp_launch_kernel<Kernel, 2>(av.pQueue, ext, tile, f);
#else //if __KALMAR_ACCELERATOR__ != 1
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-variable"
template <int D0, int D1, int D2, typename Kernel>
__attribute__((noinline,used))
void parallel_for_each(const accelerator_view& av, tiled_extent<D0, D1, D2> compute_domain, const Kernel& f) restrict(cpu,amp) {
#if __KALMAR_ACCELERATOR__ != 1
    if(compute_domain[0]<=0 || compute_domain[1]<=0 || compute_domain[2]<=0) {
        throw invalid_compute_domain("Extent is less or equal than 0.");
    }
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    if (static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
        throw invalid_compute_domain("Extent size too large.");
    size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
                     static_cast<size_t>(compute_domain[1]),
                     static_cast<size_t>(compute_domain[0])};
    size_t tile[3] = {compute_domain.tile_dim2,
                      compute_domain.tile_dim1,
                      compute_domain.tile_dim0};
    static_assert((compute_domain.tile_dim2 * compute_domain.tile_dim1 * compute_domain.tile_dim0) <= 1024, "The maximum number of threads in a tile is 1024");
    if((ext[0] % tile[0] != 0) || (ext[1] % tile[1] != 0) || (ext[2] % tile[2] != 0)) {
        throw invalid_compute_domain("Extent can't be evenly divisible by tile size.");
    }
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
    if (is_cpu()) {
        launch_cpu_task(av.pQueue, f, compute_domain);
        return;
    }
#endif
    Kalmar::mcw_cxxamp_launch_kernel<Kernel, 3>(av.pQueue, ext, tile, f);
#else //if __KALMAR_ACCELERATOR__ != 1
    auto foo = &Kernel::__cxxamp_trampoline;
    auto bar = &Kernel::operator();
#endif
}
#pragma clang diagnostic pop
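// --- Example (illustrative; not part of amp.h) ---
// Driving the tiled_extent&lt;D0&gt; overload above: each 256-thread tile stages
// data in tile_static memory and synchronizes with a barrier before use.
// `in` and `out` are assumptions; the extent must divide evenly by the tile size.
void reverse_within_tiles(const array_view<const float, 1>& in,
                          const array_view<float, 1>& out) {
    parallel_for_each(in.get_extent().tile<256>(), [=](tiled_index<256> tidx) restrict(amp) {
        tile_static float scratch[256];
        scratch[tidx.local[0]] = in[tidx.global];
        tidx.barrier.wait();   // every element is staged before any thread reads back
        out[tidx.global] = scratch[255 - tidx.local[0]];
    });
}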