HCC
HCC is a single-source C/C++ compiler for heterogeneous computing, optimized for HSA (http://www.hsafoundation.com/).
amp.h
1 //===----------------------------------------------------------------------===//
2 //
3 // This file is distributed under the University of Illinois Open Source
4 // License. See LICENSE.TXT for details.
5 //
6 //===----------------------------------------------------------------------===//
7 
13 #pragma once
14 
15 #include "hc_defines.h"
16 #include "kalmar_exception.h"
17 #include "kalmar_index.h"
18 #include "kalmar_runtime.h"
19 #include "kalmar_buffer.h"
20 #include "kalmar_serialize.h"
21 #include "kalmar_launch.h"
22 #include "kalmar_cpu_launch.h"
23 
24 // forward declarations
25 namespace Concurrency {
26 class completion_future;
27 class accelerator;
28 class accelerator_view;
29 template <typename T, int N> class array_view;
30 template <typename T, int N> class array;
31 template <int N> class extent;
32 template <int D0, int D1=0, int D2=0> class tiled_extent;
33 } // namespace Concurrency
34 
35 // namespace alias
36 // namespace concurrency is an alias of namespace Concurrency
37 namespace concurrency = Concurrency;
38 
39 
40 // type alias
41 namespace Concurrency {
42 
46 template <int N>
47 using index = Kalmar::index<N>;
48 
49 using runtime_exception = Kalmar::runtime_exception;
50 using invalid_compute_domain = Kalmar::invalid_compute_domain;
51 using accelerator_view_removed = Kalmar::accelerator_view_removed;
52 } // namespace Concurrency
53 
54 
59 namespace Concurrency {
60 
61 using namespace Kalmar::enums;
62 using namespace Kalmar::CLAMP;
63 
64 // ------------------------------------------------------------------------
65 // accelerator_view
66 // ------------------------------------------------------------------------
67 
73 class accelerator_view {
74 public:
82  accelerator_view(const accelerator_view& other) :
83  pQueue(other.pQueue) {}
84 
94  accelerator_view& operator=(const accelerator_view& other) {
95  pQueue = other.pQueue;
96  return *this;
97  }
98 
105  queuing_mode get_queuing_mode() const { return pQueue->get_mode(); }
106 
118  // FIXME: dummy implementation now
119  bool get_is_auto_selection() { return false; }
120 
130  unsigned int get_version() const;
131 
135  accelerator get_accelerator() const;
136 
144  // FIXME: dummy implementation now
145  bool get_is_debug() const { return false; }
146 
151  void wait() { pQueue->wait(); }
152 
182  void flush() { pQueue->flush(); }
183 
193  // FIXME: dummy implementation now
194  completion_future create_marker();
195 
204  bool operator==(const accelerator_view& other) const {
205  return pQueue == other.pQueue;
206  }
207 
216  bool operator!=(const accelerator_view& other) const { return !(*this == other); }
217 
218 private:
219  accelerator_view(std::shared_ptr<Kalmar::KalmarQueue> pQueue) : pQueue(pQueue) {}
220  std::shared_ptr<Kalmar::KalmarQueue> pQueue;
221  friend class accelerator;
222 
223  template<typename Kernel, int dim_ext> friend
224  void Kalmar::mcw_cxxamp_launch_kernel(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
225  template<typename Kernel, int dim_ext> friend
226  std::shared_future<void>* Kalmar::mcw_cxxamp_launch_kernel_async(const std::shared_ptr<Kalmar::KalmarQueue>&, size_t *, size_t *, const Kernel&);
227 
228 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
229  template <typename Kernel, int N> friend
230  void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>&, Kernel const&, extent<N> const&);
231 #endif
232 
233  template <typename Q, int K> friend class array;
234  template <typename Q, int K> friend class array_view;
235 
236  template <int N, typename Kernel> friend
237  void parallel_for_each(Concurrency::extent<N>, const Kernel&);
238  template <int N, typename Kernel> friend
239  void parallel_for_each(const accelerator_view&, Concurrency::extent<N>, const Kernel&);
240  template <typename Kernel> friend
241  void parallel_for_each(const accelerator_view&, Concurrency::extent<1>, const Kernel&);
242  template <typename Kernel> friend
243  void parallel_for_each(const accelerator_view&, Concurrency::extent<2>, const Kernel&);
244  template <typename Kernel> friend
245  void parallel_for_each(const accelerator_view&, Concurrency::extent<3>, const Kernel&);
246 
247  template <int D0, typename Kernel> friend
248  void parallel_for_each(tiled_extent<D0>, const Kernel&);
249  template <int D0, typename Kernel> friend
250  void parallel_for_each(const accelerator_view&, tiled_extent<D0>, const Kernel&);
251 
252  template <int D0, int D1, typename Kernel> friend
253  void parallel_for_each(tiled_extent<D0,D1>, const Kernel&);
254  template <int D0, int D1, typename Kernel> friend
255  void parallel_for_each(const accelerator_view&, tiled_extent<D0, D1>, const Kernel&);
256 
257  template <int D0, int D1, int D2, typename Kernel> friend
258  void parallel_for_each(tiled_extent<D0,D1,D2>, const Kernel&);
259  template <int D0, int D1, int D2, typename Kernel> friend
260  void parallel_for_each(const accelerator_view&, tiled_extent<D0, D1, D2>, const Kernel&);
261 
262 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
263 public:
264 #endif
265  __attribute__((annotate("user_deserialize")))
266  accelerator_view() restrict(amp,cpu) {
267 #if __KALMAR_ACCELERATOR__ != 1
268  throw runtime_exception("errorMsg_throw", 0);
269 #endif
270  }
271 };
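
For orientation, a minimal usage sketch (an editorial example, not part of amp.h; the function name and the empty kernel are placeholders):

#include <amp.h>
using namespace concurrency;

void example_view() {
    accelerator acc;                                // default accelerator
    accelerator_view av = acc.get_default_view();
    parallel_for_each(av, extent<1>(256),
                      [](index<1> idx) restrict(amp) { /* kernel body */ });
    av.flush();   // submit pending commands to the device without blocking
    av.wait();    // block until all commands on this view have finished
}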
272 
273 // ------------------------------------------------------------------------
274 // accelerator
275 // ------------------------------------------------------------------------
276 
282 class accelerator
283 {
284 public:
285 
302  static const wchar_t default_accelerator[]; // = L"default"
303  static const wchar_t cpu_accelerator[]; // = L"cpu"
304 
317  accelerator() : accelerator(default_accelerator) {}
318 
334  explicit accelerator(const std::wstring& path)
335  : pDev(Kalmar::getContext()->getDevice(path)) {}
336 
344  accelerator(const accelerator& other) : pDev(other.pDev) {}
345 
353  static std::vector<accelerator> get_all() {
354  auto Devices = Kalmar::getContext()->getDevices();
355  std::vector<accelerator> ret(Devices.size());
356  for (std::size_t i = 0; i < ret.size(); ++i)
357  ret[i] = Devices[i];
358  return std::move(ret);
359  }
360 
374  static bool set_default(const std::wstring& path) {
375  return Kalmar::getContext()->set_default(path);
376  }
377 
393  static accelerator_view get_auto_selection_view() {
394  return Kalmar::getContext()->auto_select();
395  }
396 
406  accelerator& operator=(const accelerator& other) {
407  pDev = other.pDev;
408  return *this;
409  }
410 
417  accelerator_view get_default_view() const { return pDev->get_default_queue(); }
418 
427  accelerator_view create_view(queuing_mode qmode = queuing_mode_automatic) {
428  auto pQueue = pDev->createQueue();
429  pQueue->set_mode(qmode);
430  return pQueue;
431  }
432 
441  bool operator==(const accelerator& other) const { return pDev == other.pDev; }
442 
451  bool operator!=(const accelerator& other) const { return !(*this == other); }
452 
471  bool set_default_cpu_access_type(access_type default_cpu_access_type) {
472  pDev->set_access(default_cpu_access_type);
473  return true;
474  }
475 
481  std::wstring get_device_path() const { return pDev->get_path(); }
482 
486  std::wstring get_description() const { return pDev->get_description(); }
487 
494  unsigned int get_version() const { return pDev->get_version(); }
495 
503  // FIXME: dummy implementation now
504  bool get_has_display() const { return false; }
505 
511  size_t get_dedicated_memory() const { return pDev->get_mem(); }
512 
518  bool get_supports_double_precision() const { return pDev->is_double(); }
519 
526  bool get_supports_limited_double_precision() const { return pDev->is_lim_double(); }
527 
532  // FIXME: dummy implementation now
533  bool get_is_debug() const { return false; }
534 
539  bool get_is_emulated() const { return pDev->is_emulated(); }
540 
545  bool get_supports_cpu_shared_memory() const { return pDev->is_unified(); }
546 
550  access_type get_default_cpu_access_type() const { return pDev->get_access(); }
551 
552 private:
553  accelerator(Kalmar::KalmarDevice* pDev) : pDev(pDev) {}
554  friend class accelerator_view;
555  Kalmar::KalmarDevice* pDev;
556 };
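
A short device-selection sketch against the interface above (pick_device is a hypothetical helper; set_default typically succeeds only before the default accelerator is first used):

#include <amp.h>
#include <vector>
using namespace concurrency;

void pick_device() {
    std::vector<accelerator> accs = accelerator::get_all();
    for (const accelerator& a : accs) {
        // prefer a non-emulated device with full double-precision support
        if (!a.get_is_emulated() && a.get_supports_double_precision()) {
            accelerator::set_default(a.get_device_path());
            break;
        }
    }
}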
557 
558 // ------------------------------------------------------------------------
559 // completion_future
560 // ------------------------------------------------------------------------
561 
571 class completion_future {
572 public:
573 
579  completion_future() {}
580 
588  completion_future(const completion_future& other)
589  : __amp_future(other.__amp_future), __thread_then(other.__thread_then) {}
590 
600  completion_future(completion_future&& other)
601  : __amp_future(std::move(other.__amp_future)), __thread_then(other.__thread_then) {}
602 
611  completion_future& operator=(const completion_future& other) {
612  if (this != &other) {
613  __amp_future = other.__amp_future;
614  __thread_then = other.__thread_then;
615  }
616  return (*this);
617  }
618 
628  completion_future& operator=(completion_future&& other) {
629  if (this != &other) {
630  __amp_future = std::move(other.__amp_future);
631  __thread_then = other.__thread_then;
632  }
633  return (*this);
634  }
635 
643  void get() const {
644  __amp_future.get();
645  }
646 
652  bool valid() const {
653  return __amp_future.valid();
654  }
655 
669  void wait() const {
670  if(this->valid())
671  __amp_future.wait();
672  }
673 
674  template <class _Rep, class _Period>
675  std::future_status wait_for(const std::chrono::duration<_Rep, _Period>& _Rel_time) const {
676  return __amp_future.wait_for(_Rel_time);
677  }
678 
679  template <class _Clock, class _Duration>
680  std::future_status wait_until(const std::chrono::time_point<_Clock, _Duration>& _Abs_time) const {
681  return __amp_future.wait_until(_Abs_time);
682  }
683 
691  operator std::shared_future<void>() const {
692  return __amp_future;
693  }
694 
701  // FIXME: notice we removed const from the signature here
702  // the original signature in the specification should be
703  // template<typename functor>
704  // void then(const functor& func) const;
705  template<typename functor>
706  void then(const functor & func) {
707 #if __KALMAR_ACCELERATOR__ != 1
708  // the callback can only be assigned once
709  if (__thread_then == nullptr) {
710  // spawn a new thread to wait on the future and then execute the callback functor
711  __thread_then = new std::thread([&]() restrict(cpu) {
712  this->wait();
713  if(this->valid())
714  func();
715  });
716  }
717 #endif
718  }
719 
720  ~completion_future() {
721  if (__thread_then != nullptr) {
722  __thread_then->join();
723  }
724  delete __thread_then;
725  __thread_then = nullptr;
726  }
727 
728 private:
729  std::shared_future<void> __amp_future;
730  std::thread* __thread_then = nullptr;
731 
732  completion_future(const std::shared_future<void> &__future)
733  : __amp_future(__future) {}
734 
735  template <typename T, int N> friend
736  completion_future copy_async(const array<T, N>& src, array<T, N>& dest);
737  template <typename T, int N> friend
738  completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest);
739  template <typename T, int N> friend
740  completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest);
741  template <typename T, int N> friend
742  completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest);
743  template <typename T, int N> friend
744  completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest);
745 
746  template <typename InputIter, typename T, int N> friend
747  completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
748  template <typename InputIter, typename T, int N> friend
749  completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
750  template <typename InputIter, typename T, int N> friend
751  completion_future copy_async(InputIter srcBegin, array<T, N>& dest);
752  template <typename InputIter, typename T, int N> friend
753  completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest);
754  template <typename OutputIter, typename T, int N> friend
755  completion_future copy_async(const array<T, N>& src, OutputIter destBegin);
756  template <typename OutputIter, typename T, int N> friend
757  completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin);
758 
759  template <typename T, int N> friend class array_view;
760 };
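
A sketch of consuming a completion_future from one of the copy_async overloads declared above (async_copy_example is hypothetical). Since then() keeps only a reference to the functor on its helper thread, the callback object should outlive the future:

#include <amp.h>
#include <cstdio>
#include <vector>
using namespace concurrency;

void async_copy_example() {
    std::vector<int> host(1024, 1);
    array<int, 1> dev(1024);
    auto on_done = []() { std::printf("copy finished\n"); };  // outlives cf
    completion_future cf = copy_async(host.begin(), host.end(), dev);
    cf.then(on_done);  // runs on a helper thread once the copy completes
    cf.wait();         // block; cf's destructor then joins the helper thread
}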
761 
762 // ------------------------------------------------------------------------
763 // member function implementations
764 // ------------------------------------------------------------------------
765 
766 inline accelerator accelerator_view::get_accelerator() const { return pQueue->getDev(); }
767 
768 inline completion_future accelerator_view::create_marker() { return completion_future(); }
769 
770 inline unsigned int accelerator_view::get_version() const { return get_accelerator().get_version(); }
771 
772 
773 // ------------------------------------------------------------------------
774 // extent
775 // ------------------------------------------------------------------------
776 
784 template <int N>
785 class extent {
786 public:
787 
791  static const int rank = N;
792 
796  typedef int value_type;
797 
802  extent() restrict(amp,cpu) : base_() {
803  static_assert(N > 0, "Dimensionality must be positive");
804  };
805 
812  extent(const extent& other) restrict(amp,cpu)
813  : base_(other.base_) {}
814 
824  explicit extent(int e0) restrict(amp,cpu)
825  : base_(e0) {}
826 
827  template <typename ..._Tp>
828  explicit extent(_Tp ... __t) restrict(amp,cpu)
829  : base_(__t...) {
830  static_assert(sizeof...(__t) <= 3, "Can only supply at most 3 individual coordinates in the constructor");
831  static_assert(sizeof...(__t) == N, "rank should be consistent");
832  }
833 
844  explicit extent(const int components[]) restrict(amp,cpu)
845  : base_(components) {}
846 
855  // FIXME: this function is not defined in C++AMP specification.
856  explicit extent(int components[]) restrict(amp,cpu)
857  : base_(components) {}
858 
866  extent& operator=(const extent& other) restrict(amp,cpu) {
867  base_.operator=(other.base_);
868  return *this;
869  }
870 
878  int operator[] (unsigned int c) const restrict(amp,cpu) {
879  return base_[c];
880  }
881  int& operator[] (unsigned int c) restrict(amp,cpu) {
882  return base_[c];
883  }
884 
895  bool contains(const index<N>& idx) const restrict(amp,cpu) {
896  return Kalmar::amp_helper<N, index<N>, extent<N>>::contains(idx, *this);
897  }
898 
904  unsigned int size() const restrict(amp,cpu) {
905  return Kalmar::index_helper<N, extent<N>>::count_size(*this);
906  }
907 
908 
921  template <int D0>
922  tiled_extent<D0> tile() const restrict(amp,cpu) {
923  static_assert(N == 1, "One-dimensional tile() method only available on extent<1>");
924  static_assert(D0 > 0, "All tile dimensions must be positive");
925  return tiled_extent<D0>(*this);
926  }
927  template <int D0, int D1>
928  tiled_extent<D0, D1> tile() const restrict(amp,cpu) {
929  static_assert(N == 2, "Two-dimensional tile() method only available on extent<2>");
930  static_assert(D0 > 0 && D1 > 0, "All tile dimensions must be positive");
931  return tiled_extent<D0, D1>(*this);
932  }
933  template <int D0, int D1, int D2>
934  tiled_extent<D0, D1, D2> tile() const restrict(amp,cpu) {
935  static_assert(N == 3, "Three-dimensional tile() method only available on extent<3>");
936  static_assert(D0 > 0 && D1 > 0 && D2 > 0, "All tile dimensions must be positive");
937  return tiled_extent<D0, D1, D2>(*this);
938  }
939 
952  // FIXME: the signature is not entirely the same as defined in:
953  // C++AMP spec v1.2 #1255
954  bool operator==(const extent& other) const restrict(amp,cpu) {
955  return Kalmar::index_helper<N, extent<N> >::equal(*this, other);
956  }
957  bool operator!=(const extent& other) const restrict(amp,cpu) {
958  return !(*this == other);
959  }
960 
971  extent& operator+=(const extent& __r) restrict(amp,cpu) {
972  base_.operator+=(__r.base_);
973  return *this;
974  }
975  extent& operator-=(const extent& __r) restrict(amp,cpu) {
976  base_.operator-=(__r.base_);
977  return *this;
978  }
979 
980  // FIXME: this function is not defined in C++AMP specification.
981  extent& operator*=(const extent& __r) restrict(amp,cpu) {
982  base_.operator*=(__r.base_);
983  return *this;
984  }
985  // FIXME: this function is not defined in C++AMP specification.
986  extent& operator/=(const extent& __r) restrict(amp,cpu) {
987  base_.operator/=(__r.base_);
988  return *this;
989  }
990  // FIXME: this function is not defined in C++AMP specification.
991  extent& operator%=(const extent& __r) restrict(amp,cpu) {
992  base_.operator%=(__r.base_);
993  return *this;
994  }
1005  extent operator+(const index<N>& idx) restrict(amp,cpu) {
1006  extent __r = *this;
1007  __r += idx;
1008  return __r;
1009  }
1010  extent operator-(const index<N>& idx) restrict(amp,cpu) {
1011  extent __r = *this;
1012  __r -= idx;
1013  return __r;
1014  }
1015  extent& operator+=(const index<N>& idx) restrict(amp,cpu) {
1016  base_.operator+=(idx.base_);
1017  return *this;
1018  }
1019  extent& operator-=(const index<N>& idx) restrict(amp,cpu) {
1020  base_.operator-=(idx.base_);
1021  return *this;
1022  }
1023 
1035  extent& operator+=(int value) restrict(amp,cpu) {
1036  base_.operator+=(value);
1037  return *this;
1038  }
1039  extent& operator-=(int value) restrict(amp,cpu) {
1040  base_.operator-=(value);
1041  return *this;
1042  }
1043  extent& operator*=(int value) restrict(amp,cpu) {
1044  base_.operator*=(value);
1045  return *this;
1046  }
1047  extent& operator/=(int value) restrict(amp,cpu) {
1048  base_.operator/=(value);
1049  return *this;
1050  }
1051  extent& operator%=(int value) restrict(amp,cpu) {
1052  base_.operator%=(value);
1053  return *this;
1054  }
1055 
1066  extent& operator++() restrict(amp,cpu) {
1067  base_.operator+=(1);
1068  return *this;
1069  }
1070  extent operator++(int) restrict(amp,cpu) {
1071  extent ret = *this;
1072  base_.operator+=(1);
1073  return ret;
1074  }
1075  extent& operator--() restrict(amp,cpu) {
1076  base_.operator-=(1);
1077  return *this;
1078  }
1079  extent operator--(int) restrict(amp,cpu) {
1080  extent ret = *this;
1081  base_.operator-=(1);
1082  return ret;
1083  }
1084 
1087  // FIXME: this function is not defined in C++AMP specification.
1088  template <int D0, int D1, int D2>
1089  explicit extent(const tiled_extent<D0, D1, D2>& other) restrict(amp,cpu)
1090  : base_(other.base_) {}
1091 
1092 private:
1093  typedef Kalmar::index_impl<typename Kalmar::__make_indices<N>::type> base;
1094  base base_;
1095  template <int K, typename Q> friend struct Kalmar::index_helper;
1096  template <int K, typename Q1, typename Q2> friend struct Kalmar::amp_helper;
1097 };
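
A few lines exercising the operations above (a sketch; extent_example is hypothetical):

using namespace concurrency;

void extent_example() {
    extent<2> e(4, 6);                        // a 4 x 6 compute domain
    unsigned int n = e.size();                // 24 elements
    bool inside = e.contains(index<2>(3, 5)); // true: within [0,4) x [0,6)
    tiled_extent<2, 3> t = e.tile<2, 3>();    // partition into 2 x 3 tiles
    (void)n; (void)inside; (void)t;
}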
1098 
1099 // ------------------------------------------------------------------------
1100 // utility class for tiled_barrier
1101 // ------------------------------------------------------------------------
1102 
1103 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1104 template <typename Ker, typename Ti>
1105 void bar_wrapper(Ker *f, Ti *t)
1106 {
1107  (*f)(*t);
1108 }
1109 
1110 struct barrier_t {
1111  std::unique_ptr<ucontext_t[]> ctx;
1112  int idx;
1113  barrier_t (int a) :
1114  ctx(new ucontext_t[a + 1]) {}
1115  template <typename Ti, typename Ker>
1116  void setctx(int x, char *stack, Ker& f, Ti* tidx, int S) {
1117  getcontext(&ctx[x]);
1118  ctx[x].uc_stack.ss_sp = stack;
1119  ctx[x].uc_stack.ss_size = S;
1120  ctx[x].uc_link = &ctx[x - 1];
1121  makecontext(&ctx[x], (void (*)(void))bar_wrapper<Ker, Ti>, 2, &f, tidx);
1122  }
1123  void swap(int a, int b) {
1124  swapcontext(&ctx[a], &ctx[b]);
1125  }
1126  void wait() {
1127  --idx;
1128  swapcontext(&ctx[idx + 1], &ctx[idx]);
1129  }
1130 };
1131 #endif
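
barrier_t emulates a tile barrier on the CPU path by giving each work-item its own ucontext fiber and cooperatively switching between them: wait() suspends the current fiber and resumes the next one. A self-contained sketch of that mechanism (plain host code, not part of amp.h):

#include <ucontext.h>
#include <cstdio>

static ucontext_t main_ctx, fiber_ctx;

static void fiber_fn() {
    std::printf("fiber: before barrier\n");
    swapcontext(&fiber_ctx, &main_ctx);  // "wait": yield back to the scheduler
    std::printf("fiber: after barrier\n");
}

int main() {
    static char stack[16 * 1024];
    getcontext(&fiber_ctx);
    fiber_ctx.uc_stack.ss_sp = stack;
    fiber_ctx.uc_stack.ss_size = sizeof(stack);
    fiber_ctx.uc_link = &main_ctx;         // resume main when fiber_fn returns
    makecontext(&fiber_ctx, fiber_fn, 0);
    swapcontext(&main_ctx, &fiber_ctx);    // run the fiber until its "wait"
    std::printf("main: fiber reached barrier\n");
    swapcontext(&main_ctx, &fiber_ctx);    // release the fiber past the barrier
    return 0;
}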
1132 
1133 #ifndef CLK_LOCAL_MEM_FENCE
1134 #define CLK_LOCAL_MEM_FENCE (1)
1135 #endif
1136 
1137 #ifndef CLK_GLOBAL_MEM_FENCE
1138 #define CLK_GLOBAL_MEM_FENCE (2)
1139 #endif
1140 
1141 // ------------------------------------------------------------------------
1142 // tile_barrier
1143 // ------------------------------------------------------------------------
1144 
1152 class tile_barrier {
1153 public:
1154 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1155  using pb_t = std::shared_ptr<barrier_t>;
1156  tile_barrier(pb_t pb) : pbar(pb) {}
1157 
1165  tile_barrier(const tile_barrier& other) restrict(amp,cpu) : pbar(other.pbar) {}
1166 #else
1167 
1175  tile_barrier(const tile_barrier& other) restrict(amp,cpu) {}
1176 #endif
1177 
1188  void wait() const restrict(amp) {
1189 #if __KALMAR_ACCELERATOR__ == 1
1190  wait_with_all_memory_fence();
1191 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1192  pbar->wait();
1193 #endif
1194  }
1195 
1205  void wait_with_all_memory_fence() const restrict(amp) {
1206 #if __KALMAR_ACCELERATOR__ == 1
1207  amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
1208 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1209  pbar->wait();
1210 #endif
1211  }
1212 
1222  void wait_with_global_memory_fence() const restrict(amp) {
1223 #if __KALMAR_ACCELERATOR__ == 1
1224  amp_barrier(CLK_GLOBAL_MEM_FENCE);
1225 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1226  pbar->wait();
1227 #endif
1228  }
1229 
1240  void wait_with_tile_static_memory_fence() const restrict(amp) {
1241 #if __KALMAR_ACCELERATOR__ == 1
1242  amp_barrier(CLK_LOCAL_MEM_FENCE);
1243 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1244  pbar->wait();
1245 #endif
1246  }
1247 
1248 private:
1249 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1250  tile_barrier() restrict(amp,cpu) = default;
1251  pb_t pbar;
1252 #else
1253  tile_barrier() restrict(amp) {}
1254 #endif
1255 
1256  template<int D0, int D1, int D2> friend
1257  class tiled_index;
1258 };
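
A typical use of wait(): a per-tile reduction staged through tile_static memory (a sketch; tile_sum and its parameters are hypothetical, and the input extent is assumed to be a multiple of the tile size):

#include <amp.h>
using namespace concurrency;

void tile_sum(const array_view<const int, 1>& data,  // e.g. 1024 elements
              const array_view<int, 1>& partial) {   // one slot per tile
    parallel_for_each(data.get_extent().tile<64>(),
                      [=](tiled_index<64> tidx) restrict(amp) {
        tile_static int scratch[64];
        scratch[tidx.local[0]] = data[tidx.global];
        tidx.barrier.wait();  // all 64 stores are visible past this point
        if (tidx.local[0] == 0) {
            int sum = 0;
            for (int i = 0; i < 64; ++i) sum += scratch[i];
            partial[tidx.tile[0]] = sum;
        }
    });
}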
1259 
1260 // ------------------------------------------------------------------------
1261 // other memory fences
1262 // ------------------------------------------------------------------------
1263 
1269 // FIXME: this function has not been implemented.
1270 void all_memory_fence(const tile_barrier&) restrict(amp);
1271 
1277 // FIXME: this function has not been implemented.
1278 void global_memory_fence(const tile_barrier&) restrict(amp);
1279 
1285 // FIXME: this function has not been implemented.
1286 void tile_static_memory_fence(const tile_barrier&) restrict(amp);
1287 
1288 // ------------------------------------------------------------------------
1289 // tiled_index
1290 // ------------------------------------------------------------------------
1291 
1300 template <int D0, int D1=0, int D2=0>
1301 class tiled_index {
1302 public:
1307  static const int rank = 3;
1308 
1309  // FIXME: missing constructor:
1310  // tiled_index(const index<N>& global,
1311  // const index<N>& local,
1312  // const index<N>& tile,
1313  // const index<N>& tile_origin,
1314  // const tile_barrier& barrier) restrict(amp,cpu);
1315 
1323  tiled_index(const tiled_index<D0, D1, D2>& o) restrict(amp, cpu)
1324  : global(o.global), local(o.local), tile(o.tile), tile_origin(o.tile_origin), barrier(o.barrier) {}
1325 
1330  const index<3> global;
1331 
1336  const index<3> local;
1337 
1342  const index<3> tile;
1343 
1348  const index<3> tile_origin;
1349 
1353  const tile_barrier barrier;
1354 
1360  operator const index<3>() const restrict(amp,cpu) {
1361  return global;
1362  }
1363 
1378  Concurrency::extent<3> get_tile_extent() const restrict(amp, cpu) {
1379  return tile_extent;
1380  }
1381  const Concurrency::extent<3> tile_extent;
1382 
1389  static const int tile_dim0 = D0;
1390  static const int tile_dim1 = D1;
1391  static const int tile_dim2 = D2;
1392 
1396  // FIXME: this function is not defined in C++AMP specification.
1397  tiled_index(const index<3>& g) restrict(amp, cpu) : global(g) {}
1398 
1399 private:
1400 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1401  tiled_index(int a0, int a1, int a2, int b0, int b1, int b2,
1402  int c0, int c1, int c2, tile_barrier& pb) restrict(amp,cpu)
1403  : global(a2, a1, a0), local(b2, b1, b0), tile(c2, c1, c0),
1404  tile_origin(a2 - b2, a1 - b1, a0 - b0), barrier(pb), tile_extent(D0, D1, D2) {}
1405 #endif
1406 
1407  __attribute__((annotate("__cxxamp_opencl_index")))
1408 #if __KALMAR_ACCELERATOR__ == 1
1409  __attribute__((always_inline)) tiled_index() restrict(amp)
1410  : global(index<3>(amp_get_global_id(2), amp_get_global_id(1), amp_get_global_id(0))),
1411  local(index<3>(amp_get_local_id(2), amp_get_local_id(1), amp_get_local_id(0))),
1412  tile(index<3>(amp_get_group_id(2), amp_get_group_id(1), amp_get_group_id(0))),
1413  tile_origin(index<3>(amp_get_global_id(2)-amp_get_local_id(2),
1414  amp_get_global_id(1)-amp_get_local_id(1),
1415  amp_get_global_id(0)-amp_get_local_id(0))),
1416  tile_extent(D0, D1, D2)
1417 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1418  __attribute__((always_inline)) tiled_index() restrict(amp, cpu)
1419 #else
1420  __attribute__((always_inline)) tiled_index() restrict(amp)
1421 #endif // __KALMAR_ACCELERATOR__
1422  {}
1423 
1424  template<int D0_, int D1_, int D2_, typename K> friend
1425  void parallel_for_each(const accelerator_view&, tiled_extent<D0_, D1_, D2_>, const K&);
1426 
1427 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1428  template<typename K, int D1_, int D2_, int D3_> friend
1429  void partitioned_task_tile(K const&, tiled_extent<D1_, D2_, D3_> const&, int);
1430 #endif
1431 };
1432 
1441 template <int D0>
1442 class tiled_index<D0, 0, 0> {
1443 public:
1448  static const int rank = 3;
1449 
1450  // FIXME: missing constructor:
1451  // tiled_index(const index<N>& global,
1452  // const index<N>& local,
1453  // const index<N>& tile,
1454  // const index<N>& tile_origin,
1455  // const tile_barrier& barrier) restrict(amp,cpu);
1456 
1464  tiled_index(const tiled_index<D0>& o) restrict(amp, cpu)
1465  : global(o.global), local(o.local), tile(o.tile), tile_origin(o.tile_origin), barrier(o.barrier) {}
1466 
1471  const index<1> global;
1472 
1477  const index<1> local;
1478 
1483  const index<1> tile;
1484 
1489  const index<1> tile_origin;
1490 
1494  const tile_barrier barrier;
1495 
1501  operator const index<1>() const restrict(amp,cpu) {
1502  return global;
1503  }
1504 
1519  Concurrency::extent<1> get_tile_extent() const restrict(amp, cpu) {
1520  return tile_extent;
1521  }
1522  const Concurrency::extent<1> tile_extent;
1523 
1530  static const int tile_dim0 = D0;
1531 
1534  // FIXME: this function is not defined in C++AMP specification.
1535  tiled_index(const index<1>& g) restrict(amp, cpu) : global(g) {}
1536 
1537 private:
1538 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1539  __attribute__((always_inline)) tiled_index(int a, int b, int c, tile_barrier& pb) restrict(amp, cpu)
1540  : global(a), local(b), tile(c), tile_origin(a - b), barrier(pb), tile_extent(D0) {}
1541 #endif
1542 
1543  __attribute__((annotate("__cxxamp_opencl_index")))
1544 #if __KALMAR_ACCELERATOR__ == 1
1545  __attribute__((always_inline)) tiled_index() restrict(amp)
1546  : global(index<1>(amp_get_global_id(0))),
1547  local(index<1>(amp_get_local_id(0))),
1548  tile(index<1>(amp_get_group_id(0))),
1549  tile_origin(index<1>(amp_get_global_id(0)-amp_get_local_id(0))),
1550  tile_extent(D0)
1551 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1552  __attribute__((always_inline)) tiled_index() restrict(amp,cpu)
1553 #else
1554  __attribute__((always_inline)) tiled_index() restrict(amp)
1555 #endif // __KALMAR_ACCELERATOR__
1556  {}
1557 
1558  template<int D, typename K> friend
1559  void parallel_for_each(const accelerator_view&, tiled_extent<D>, const K&);
1560 
1561 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1562  template<typename K, int D> friend
1563  void partitioned_task_tile(K const&, tiled_extent<D> const&, int);
1564 #endif
1565 };
1566 
1575 template <int D0, int D1>
1576 class tiled_index<D0, D1, 0> {
1577 public:
1582  static const int rank = 2;
1583 
1584  // FIXME: missing constructor:
1585  // tiled_index(const index<N>& global,
1586  // const index<N>& local,
1587  // const index<N>& tile,
1588  // const index<N>& tile_origin,
1589  // const tile_barrier& barrier) restrict(amp,cpu);
1590 
1598  tiled_index(const tiled_index<D0, D1>& o) restrict(amp, cpu)
1599  : global(o.global), local(o.local), tile(o.tile), tile_origin(o.tile_origin), barrier(o.barrier) {}
1600 
1605  const index<2> global;
1606 
1611  const index<2> local;
1612 
1617  const index<2> tile;
1618 
1623  const index<2> tile_origin;
1624 
1628  const tile_barrier barrier;
1629 
1635  operator const index<2>() const restrict(amp,cpu) {
1636  return global;
1637  }
1638 
1653  Concurrency::extent<2> get_tile_extent() const restrict(amp, cpu) {
1654  return tile_extent;
1655  }
1656  const Concurrency::extent<2> tile_extent;
1657 
1664  static const int tile_dim0 = D0;
1665  static const int tile_dim1 = D1;
1666 
1670  // FIXME: this function is not defined in C++AMP specification.
1671  tiled_index(const index<2>& g) restrict(amp, cpu) : global(g) {}
1672 
1673 private:
1674 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1675  tiled_index(int a0, int a1, int b0, int b1, int c0, int c1, tile_barrier& tbar) restrict(amp, cpu)
1676  : global(a1, a0), local(b1, b0), tile(c1, c0), tile_origin(a1 - b1, a0 - b0), barrier(tbar), tile_extent(D0, D1) {}
1677 #endif
1678 
1679  __attribute__((annotate("__cxxamp_opencl_index")))
1680 #if __KALMAR_ACCELERATOR__ == 1
1681  __attribute__((always_inline)) tiled_index() restrict(amp)
1682  : global(index<2>(amp_get_global_id(1), amp_get_global_id(0))),
1683  local(index<2>(amp_get_local_id(1), amp_get_local_id(0))),
1684  tile(index<2>(amp_get_group_id(1), amp_get_group_id(0))),
1685  tile_origin(index<2>(amp_get_global_id(1)-amp_get_local_id(1),
1686  amp_get_global_id(0)-amp_get_local_id(0))),
1687  tile_extent(D0, D1)
1688 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1689  __attribute__((always_inline)) tiled_index() restrict(amp,cpu)
1690 #else
1691  __attribute__((always_inline)) tiled_index() restrict(amp)
1692 #endif // __KALMAR_ACCELERATOR__
1693  {}
1694 
1695  template<int D0_, int D1_, typename K> friend
1696  void parallel_for_each(const accelerator_view&, tiled_extent<D0_, D1_>, const K&);
1697 
1698 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
1699  template<typename K, int D1_, int D2_> friend
1700  void partitioned_task_tile(K const&, tiled_extent<D1_, D2_> const&, int);
1701 #endif
1702 };
1703 
1704 // ------------------------------------------------------------------------
1705 // tiled_extent
1706 // ------------------------------------------------------------------------
1707 
1715 template <int D0, int D1/*=0*/, int D2/*=0*/>
1716 class tiled_extent : public extent<3>
1717 {
1718 public:
1719  static_assert(D0 > 0, "Tile size must be positive");
1720  static_assert(D1 > 0, "Tile size must be positive");
1721  static_assert(D2 > 0, "Tile size must be positive");
1722  static const int rank = 3;
1723 
1728  tiled_extent() restrict(amp,cpu) { }
1729 
1737  tiled_extent(const tiled_extent& other) restrict(amp,cpu): extent(other[0], other[1], other[2]) {}
1738 
1746  tiled_extent(const extent<3>& ext) restrict(amp,cpu): extent(ext) {}
1747 
1755  tiled_extent& operator=(const tiled_extent& other) restrict(amp,cpu);
1756 
1762  tiled_extent pad() const restrict(amp,cpu) {
1763  tiled_extent padded(*this);
1764  padded[0] = (padded[0] <= D0) ? D0 : (((padded[0] + D0 - 1) / D0) * D0);
1765  padded[1] = (padded[1] <= D1) ? D1 : (((padded[1] + D1 - 1) / D1) * D1);
1766  padded[2] = (padded[2] <= D2) ? D2 : (((padded[2] + D2 - 1) / D2) * D2);
1767  return padded;
1768  }
1769 
1775  tiled_extent truncate() const restrict(amp,cpu) {
1776  tiled_extent trunc(*this);
1777  trunc[0] = (trunc[0]/D0) * D0;
1778  trunc[1] = (trunc[1]/D1) * D1;
1779  trunc[2] = (trunc[2]/D2) * D2;
1780  return trunc;
1781  }
1782 
1795  // FIXME: this function has not been implemented.
1796  extent<3> get_tile_extent() const;
1797 
1802  static const int tile_dim0 = D0;
1803  static const int tile_dim1 = D1;
1804  static const int tile_dim2 = D2;
1815  friend bool operator==(const tiled_extent& lhs, const tiled_extent& rhs) restrict(amp,cpu);
1816  friend bool operator!=(const tiled_extent& lhs, const tiled_extent& rhs) restrict(amp,cpu);
1817 
1819 };
1820 
1828 template <int D0, int D1>
1829 class tiled_extent<D0,D1,0> : public extent<2>
1830 {
1831 public:
1832  static_assert(D0 > 0, "Tile size must be positive");
1833  static_assert(D1 > 0, "Tile size must be positive");
1834  static const int rank = 2;
1835 
1840  tiled_extent() restrict(amp,cpu) { }
1841 
1849  tiled_extent(const tiled_extent& other) restrict(amp,cpu):extent(other[0], other[1]) {}
1850 
1858  tiled_extent(const extent<2>& ext) restrict(amp,cpu):extent(ext) {}
1859 
1867  tiled_extent& operator=(const tiled_extent& other) restrict(amp,cpu);
1868 
1874  tiled_extent pad() const restrict(amp,cpu) {
1875  tiled_extent padded(*this);
1876  padded[0] = (padded[0] <= D0) ? D0 : (((padded[0] + D0 - 1) / D0) * D0);
1877  padded[1] = (padded[1] <= D1) ? D1 : (((padded[1] + D1 - 1) / D1) * D1);
1878  return padded;
1879  }
1880 
1886  tiled_extent truncate() const restrict(amp,cpu) {
1887  tiled_extent trunc(*this);
1888  trunc[0] = (trunc[0]/D0) * D0;
1889  trunc[1] = (trunc[1]/D1) * D1;
1890  return trunc;
1891  }
1892 
1905  // FIXME: this function has not been implemented.
1906  extent<2> get_tile_extent() const;
1907 
1912  static const int tile_dim0 = D0;
1913  static const int tile_dim1 = D1;
1924  friend bool operator==(const tiled_extent& lhs, const tiled_extent& rhs) restrict(amp,cpu);
1925  friend bool operator!=(const tiled_extent& lhs, const tiled_extent& rhs) restrict(amp,cpu);
1926 
1928 };
1929 
1937 template <int D0>
1938 class tiled_extent<D0,0,0> : public extent<1>
1939 {
1940 public:
1941  static_assert(D0 > 0, "Tile size must be positive");
1942  static const int rank = 1;
1943 
1948  tiled_extent() restrict(amp,cpu) { }
1949 
1957  tiled_extent(const tiled_extent& other) restrict(amp,cpu): extent(other[0]) {}
1958 
1966  tiled_extent(const extent<1>& ext) restrict(amp,cpu):extent(ext) {}
1967 
1975  tiled_extent& operator=(const tiled_extent& other) restrict(amp,cpu);
1976 
1982  tiled_extent pad() const restrict(amp,cpu) {
1983  tiled_extent padded(*this);
1984  padded[0] = (padded[0] <= D0) ? D0 : (((padded[0] + D0 - 1) / D0) * D0);
1985  return padded;
1986  }
1987 
1993  tiled_extent truncate() const restrict(amp,cpu) {
1994  tiled_extent trunc(*this);
1995  trunc[0] = (trunc[0]/D0) * D0;
1996  return trunc;
1997  }
1998 
2011  // FIXME: this function has not been implemented.
2012  extent<1> get_tile_extent() const;
2013 
2018  static const int tile_dim0 = D0;
2030  friend bool operator==(const tiled_extent& lhs, const tiled_extent& rhs) restrict(amp,cpu);
2031  friend bool operator!=(const tiled_extent& lhs, const tiled_extent& rhs) restrict(amp,cpu);
2032 
2034 };
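
The effect of pad() and truncate() on a concrete extent (a sketch; pad_truncate_example is hypothetical):

using namespace concurrency;

void pad_truncate_example() {
    extent<1> e(1000);
    tiled_extent<256> t = e.tile<256>();
    tiled_extent<256> up   = t.pad();       // 1024: rounded up to a multiple of 256
    tiled_extent<256> down = t.truncate();  //  768: rounded down to a multiple of 256
    (void)up; (void)down;
}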
2035 
2036 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
2037 #define SSIZE (1024 * 10)
2038 template <int N, typename Kernel, int K>
2039 struct cpu_helper
2040 {
2041  static inline void call(const Kernel& k, index<K>& idx, const extent<K>& ext) restrict(amp,cpu) {
2042  int i;
2043  for (i = 0; i < ext[N]; ++i) {
2044  idx[N] = i;
2045  cpu_helper<N + 1, Kernel, K>::call(k, idx, ext);
2046  }
2047  }
2048 };
2049 template <typename Kernel, int K>
2050 struct cpu_helper<K, Kernel, K>
2051 {
2052  static inline void call(const Kernel& k, const index<K>& idx, const extent<K>& ext) restrict(amp,cpu) {
2053  (const_cast<Kernel&>(k))(idx);
2054  }
2055 };
2056 
2057 template <typename Kernel, int N>
2058 void partitioned_task(const Kernel& ker, const extent<N>& ext, int part) {
2059  index<N> idx;
2060  int start = ext[0] * part / Kalmar::NTHREAD;
2061  int end = ext[0] * (part + 1) / Kalmar::NTHREAD;
2062  for (int i = start; i < end; i++) {
2063  idx[0] = i;
2064  cpu_helper<1, Kernel, N>::call(ker, idx, ext);
2065  }
2066 }
2067 
2068 template <typename Kernel, int D0>
2069 void partitioned_task_tile(Kernel const& f, tiled_extent<D0> const& ext, int part) {
2070  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
2071  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
2072  int stride = end - start;
2073  if (stride == 0)
2074  return;
2075  char *stk = new char[D0 * SSIZE];
2076  tiled_index<D0> *tidx = new tiled_index<D0>[D0];
2077  tile_barrier::pb_t amp_bar = std::make_shared<barrier_t>(D0);
2078  tile_barrier tbar(amp_bar);
2079  for (int tx = start; tx < end; tx++) {
2080  int id = 0;
2081  char *sp = stk;
2082  tiled_index<D0> *tip = tidx;
2083  for (int x = 0; x < D0; x++) {
2084  new (tip) tiled_index<D0>(tx * D0 + x, x, tx, tbar);
2085  amp_bar->setctx(++id, sp, f, tip, SSIZE);
2086  sp += SSIZE;
2087  ++tip;
2088  }
2089  amp_bar->idx = 0;
2090  while (amp_bar->idx == 0) {
2091  amp_bar->idx = id;
2092  amp_bar->swap(0, id);
2093  }
2094  }
2095  delete [] stk;
2096  delete [] tidx;
2097 }
2098 template <typename Kernel, int D0, int D1>
2099 void partitioned_task_tile(Kernel const& f, tiled_extent<D0, D1> const& ext, int part) {
2100  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
2101  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
2102  int stride = end - start;
2103  if (stride == 0)
2104  return;
2105  char *stk = new char[D1 * D0 * SSIZE];
2106  tiled_index<D0, D1> *tidx = new tiled_index<D0, D1>[D0 * D1];
2107  tile_barrier::pb_t amp_bar = std::make_shared<barrier_t>(D0 * D1);
2108  tile_barrier tbar(amp_bar);
2109 
2110  for (int tx = 0; tx < ext[1] / D1; tx++)
2111  for (int ty = start; ty < end; ty++) {
2112  int id = 0;
2113  char *sp = stk;
2114  tiled_index<D0, D1> *tip = tidx;
2115  for (int x = 0; x < D1; x++)
2116  for (int y = 0; y < D0; y++) {
2117  new (tip) tiled_index<D0, D1>(D1 * tx + x, D0 * ty + y, x, y, tx, ty, tbar);
2118  amp_bar->setctx(++id, sp, f, tip, SSIZE);
2119  ++tip;
2120  sp += SSIZE;
2121  }
2122  amp_bar->idx = 0;
2123  while (amp_bar->idx == 0) {
2124  amp_bar->idx = id;
2125  amp_bar->swap(0, id);
2126  }
2127  }
2128  delete [] stk;
2129  delete [] tidx;
2130 }
2131 
2132 template <typename Kernel, int D0, int D1, int D2>
2133 void partitioned_task_tile(Kernel const& f, tiled_extent<D0, D1, D2> const& ext, int part) {
2134  int start = (ext[0] / D0) * part / Kalmar::NTHREAD;
2135  int end = (ext[0] / D0) * (part + 1) / Kalmar::NTHREAD;
2136  int stride = end - start;
2137  if (stride == 0)
2138  return;
2139  char *stk = new char[D2 * D1 * D0 * SSIZE];
2140  tiled_index<D0, D1, D2> *tidx = new tiled_index<D0, D1, D2>[D0 * D1 * D2];
2141  tile_barrier::pb_t amp_bar = std::make_shared<barrier_t>(D0 * D1 * D2);
2142  tile_barrier tbar(amp_bar);
2143 
2144  for (int i = 0; i < ext[2] / D2; i++)
2145  for (int j = 0; j < ext[1] / D1; j++)
2146  for(int k = start; k < end; k++) {
2147  int id = 0;
2148  char *sp = stk;
2149  tiled_index<D0, D1, D2> *tip = tidx;
2150  for (int x = 0; x < D2; x++)
2151  for (int y = 0; y < D1; y++)
2152  for (int z = 0; z < D0; z++) {
2153  new (tip) tiled_index<D0, D1, D2>(D2 * i + x,
2154  D1 * j + y,
2155  D0 * k + z,
2156  x, y, z, i, j, k, tbar);
2157  amp_bar->setctx(++id, sp, f, tip, SSIZE);
2158  ++tip;
2159  sp += SSIZE;
2160  }
2161  amp_bar->idx = 0;
2162  while (amp_bar->idx == 0) {
2163  amp_bar->idx = id;
2164  amp_bar->swap(0, id);
2165  }
2166  }
2167  delete [] stk;
2168  delete [] tidx;
2169 }
2170 
2171 template <typename Kernel, int N>
2172 void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
2173  extent<N> const& compute_domain)
2174 {
2175  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
2176  for (int i = 0; i < Kalmar::NTHREAD; ++i)
2177  obj[i] = std::thread(partitioned_task<Kernel, N>, std::cref(f), std::cref(compute_domain), i);
2178 }
2179 
2180 template <typename Kernel, int D0>
2181 void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
2182  tiled_extent<D0> const& compute_domain)
2183 {
2184  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
2185  for (int i = 0; i < Kalmar::NTHREAD; ++i)
2186  obj[i] = std::thread(partitioned_task_tile<Kernel, D0>,
2187  std::cref(f), std::cref(compute_domain), i);
2188 }
2189 
2190 template <typename Kernel, int D0, int D1>
2191 void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
2192  tiled_extent<D0, D1> const& compute_domain)
2193 {
2194  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
2195  for (int i = 0; i < Kalmar::NTHREAD; ++i)
2196  obj[i] = std::thread(partitioned_task_tile<Kernel, D0, D1>,
2197  std::cref(f), std::cref(compute_domain), i);
2198 }
2199 
2200 template <typename Kernel, int D0, int D1, int D2>
2201 void launch_cpu_task(const std::shared_ptr<Kalmar::KalmarQueue>& pQueue, Kernel const& f,
2202  tiled_extent<D0, D1, D2> const& compute_domain)
2203 {
2204  Kalmar::CPUKernelRAII<Kernel> obj(pQueue, f);
2205  for (int i = 0; i < Kalmar::NTHREAD; ++i)
2206  obj[i] = std::thread(partitioned_task_tile<Kernel, D0, D1, D2>,
2207  std::cref(f), std::cref(compute_domain), i);
2208 }
2209 
2210 #endif
2211 
2212 // ------------------------------------------------------------------------
2213 // utility helper classes for array_view
2214 // ------------------------------------------------------------------------
2215 
2216 template <typename T, int N>
2217 struct projection_helper
2218 {
2219  // array_view<T,N>, where N>1
2220  // array_view<T,N-1> operator[](int i) const restrict(amp,cpu)
2221  static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
2222  typedef array_view<T, N - 1> result_type;
2223  static result_type project(array_view<T, N>& now, int stride) restrict(amp,cpu) {
2224  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
2225  for (i = N - 1; i > 0; --i) {
2226  ext_o[i - 1] = now.extent[i];
2227  ext[i - 1] = now.extent_base[i];
2228  idx[i - 1] = now.index_base[i];
2229  }
2230  stride += now.index_base[0];
2231  Concurrency::extent<N - 1> ext_now(ext_o);
2232  Concurrency::extent<N - 1> ext_base(ext);
2233  Concurrency::index<N - 1> idx_base(idx);
2234  return result_type (now.cache, ext_now, ext_base, idx_base,
2235  now.offset + ext_base.size() * stride);
2236  }
2237  static result_type project(const array_view<T, N>& now, int stride) restrict(amp,cpu) {
2238  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
2239  for (i = N - 1; i > 0; --i) {
2240  ext_o[i - 1] = now.extent[i];
2241  ext[i - 1] = now.extent_base[i];
2242  idx[i - 1] = now.index_base[i];
2243  }
2244  stride += now.index_base[0];
2245  Concurrency::extent<N - 1> ext_now(ext_o);
2246  Concurrency::extent<N - 1> ext_base(ext);
2247  Concurrency::index<N - 1> idx_base(idx);
2248  return result_type (now.cache, ext_now, ext_base, idx_base,
2249  now.offset + ext_base.size() * stride);
2250  }
2251 };
2252 template <typename T>
2253 struct projection_helper<T, 1>
2254 {
2255  // array_view<T,1>
2256  // T& operator[](int i) const restrict(amp,cpu);
2257  typedef T& result_type;
2258  static result_type project(array_view<T, 1>& now, int i) restrict(amp,cpu) {
2259 #if __KALMAR_ACCELERATOR__ != 1
2260  now.cache.get_cpu_access(true);
2261 #endif
2262  T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
2263  return *ptr;
2264  }
2265  static result_type project(const array_view<T, 1>& now, int i) restrict(amp,cpu) {
2266 #if __KALMAR_ACCELERATOR__ != 1
2267  now.cache.get_cpu_access(true);
2268 #endif
2269  T *ptr = reinterpret_cast<T *>(now.cache.get() + i + now.offset + now.index_base[0]);
2270  return *ptr;
2271  }
2272 };
2273 template <typename T, int N>
2274 struct projection_helper<const T, N>
2275 {
2276  // array_view<T,N>, where N>1
2277  // array_view<const T,N-1> operator[](int i) const restrict(amp,cpu);
2278  static_assert(N > 1, "projection_helper is only supported on array_view with a rank of 2 or higher");
2279  typedef array_view<const T, N - 1> const_result_type;
2280  static const_result_type project(array_view<const T, N>& now, int stride) restrict(amp,cpu) {
2281  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
2282  for (i = N - 1; i > 0; --i) {
2283  ext_o[i - 1] = now.extent[i];
2284  ext[i - 1] = now.extent_base[i];
2285  idx[i - 1] = now.index_base[i];
2286  }
2287  stride += now.index_base[0];
2288  Concurrency::extent<N - 1> ext_now(ext_o);
2289  Concurrency::extent<N - 1> ext_base(ext);
2290  Concurrency::index<N - 1> idx_base(idx);
2291  auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
2292  now.offset + ext_base.size() * stride);
2293  return ret;
2294  }
2295  static const_result_type project(const array_view<const T, N>& now, int stride) restrict(amp,cpu) {
2296  int ext[N - 1], i, idx[N - 1], ext_o[N - 1];
2297  for (i = N - 1; i > 0; --i) {
2298  ext_o[i - 1] = now.extent[i];
2299  ext[i - 1] = now.extent_base[i];
2300  idx[i - 1] = now.index_base[i];
2301  }
2302  stride += now.index_base[0];
2303  Concurrency::extent<N - 1> ext_now(ext_o);
2304  Concurrency::extent<N - 1> ext_base(ext);
2305  Concurrency::index<N - 1> idx_base(idx);
2306  auto ret = const_result_type (now.cache, ext_now, ext_base, idx_base,
2307  now.offset + ext_base.size() * stride);
2308  return ret;
2309  }
2310 };
2311 template <typename T>
2312 struct projection_helper<const T, 1>
2313 {
2314  // array_view<const T,1>
2315  // const T& operator[](int i) const restrict(amp,cpu);
2316  typedef const T& const_result_type;
2317  static const_result_type project(array_view<const T, 1>& now, int i) restrict(amp,cpu) {
2318 #if __KALMAR_ACCELERATOR__ != 1
2319  now.cache.get_cpu_access();
2320 #endif
2321  const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
2322  return *ptr;
2323  }
2324  static const_result_type project(const array_view<const T, 1>& now, int i) restrict(amp,cpu) {
2325 #if __KALMAR_ACCELERATOR__ != 1
2326  now.cache.get_cpu_access();
2327 #endif
2328  const T *ptr = reinterpret_cast<const T *>(now.cache.get() + i + now.offset + now.index_base[0]);
2329  return *ptr;
2330  }
2331 };
2332 
2333 // ------------------------------------------------------------------------
2334 // utility helper classes for array
2335 // ------------------------------------------------------------------------
2336 
2337 template <typename T, int N>
2338 struct array_projection_helper
2339 {
2340  // array<T,N>, where N>1
2341  // array_view<T,N-1> operator[](int i0) restrict(amp,cpu);
2342  // array_view<const T,N-1> operator[](int i0) const restrict(amp,cpu);
2343  static_assert(N > 1, "projection_helper is only supported on array with a rank of 2 or higher");
2344  typedef array_view<T, N - 1> result_type;
2345  typedef array_view<const T, N - 1> const_result_type;
2346  static result_type project(array<T, N>& now, int stride) restrict(amp,cpu) {
2347 #if __KALMAR_ACCELERATOR__ != 1
2348  if( stride < 0)
2349  throw runtime_exception("errorMsg_throw", 0);
2350 #endif
2351  int comp[N - 1], i;
2352  for (i = N - 1; i > 0; --i)
2353  comp[i - 1] = now.extent[i];
2354  Concurrency::extent<N - 1> ext(comp);
2355  int offset = ext.size() * stride;
2356 #if __KALMAR_ACCELERATOR__ != 1
2357  if( offset >= now.extent.size())
2358  throw runtime_exception("errorMsg_throw", 0);
2359 #endif
2360  return result_type(now.m_device, ext, ext, index<N - 1>(), offset);
2361  }
2362  static const_result_type project(const array<T, N>& now, int stride) restrict(amp,cpu) {
2363  int comp[N - 1], i;
2364  for (i = N - 1; i > 0; --i)
2365  comp[i - 1] = now.extent[i];
2366  Concurrency::extent<N - 1> ext(comp);
2367  int offset = ext.size() * stride;
2368  return const_result_type(now.m_device, ext, ext, index<N - 1>(), offset);
2369  }
2370 };
2371 template <typename T>
2372 struct array_projection_helper<T, 1>
2373 {
2374  // array<T,1>
2375  // T& operator[](int i0) restrict(amp,cpu);
2376  // const T& operator[](int i0) const restrict(amp,cpu);
2377  typedef T& result_type;
2378  typedef const T& const_result_type;
2379  static result_type project(array<T, 1>& now, int i) restrict(amp,cpu) {
2380 #if __KALMAR_ACCELERATOR__ != 1
2381  now.m_device.synchronize(true);
2382 #endif
2383  T *ptr = reinterpret_cast<T *>(now.m_device.get() + i);
2384  return *ptr;
2385  }
2386  static const_result_type project(const array<T, 1>& now, int i) restrict(amp,cpu) {
2387 #if __KALMAR_ACCELERATOR__ != 1
2388  now.m_device.synchronize();
2389 #endif
2390  const T *ptr = reinterpret_cast<const T *>(now.m_device.get() + i);
2391  return *ptr;
2392  }
2393 };
2394 
2395 template <int N>
2396 const Concurrency::extent<N>& check(const Concurrency::extent<N>& ext)
2397 {
2398 #if __KALMAR_ACCELERATOR__ != 1
2399  for (int i = 0; i < N; i++)
2400  {
2401  if(ext[i] <=0)
2402  throw runtime_exception("errorMsg_throw", 0);
2403  }
2404 #endif
2405  return ext;
2406 }
2407 
2408 // ------------------------------------------------------------------------
2409 // forward declarations of copy routines used by array / array_view
2410 // ------------------------------------------------------------------------
2411 
2412 template <typename T, int N>
2413 void copy(const array_view<const T, N>& src, const array_view<T, N>& dest);
2414 
2415 template <typename T, int N>
2416 void copy(const array_view<T, N>& src, const array_view<T, N>& dest);
2417 
2418 template <typename T, int N>
2419 void copy(const array<T, N>& src, const array_view<T, N>& dest);
2420 
2421 template <typename T, int N>
2422 void copy(const array<T, N>& src, array<T, N>& dest);
2423 
2424 template <typename T, int N>
2425 void copy(const array_view<const T, N>& src, array<T, N>& dest);
2426 
2427 template <typename T, int N>
2428 void copy(const array_view<T, N>& src, array<T, N>& dest);
2429 
2430 template <typename InputIter, typename T, int N>
2431 void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest);
2432 
2433 template <typename InputIter, typename T, int N>
2434 void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest);
2435 
2436 template <typename InputIter, typename T, int N>
2437 void copy(InputIter srcBegin, const array_view<T, N>& dest);
2438 
2439 template <typename InputIter, typename T, int N>
2440 void copy(InputIter srcBegin, array<T, N>& dest);
2441 
2442 template <typename OutputIter, typename T, int N>
2443 void copy(const array_view<T, N> &src, OutputIter destBegin);
2444 
2445 template <typename OutputIter, typename T, int N>
2446 void copy(const array<T, N> &src, OutputIter destBegin);
2447 
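A host/device round trip using two of the overloads declared above (copy_example is hypothetical):

#include <amp.h>
#include <vector>
using namespace concurrency;

void copy_example() {
    std::vector<float> src(100, 1.0f);
    array<float, 1> dev(100);
    copy(src.begin(), src.end(), dev);  // host iterators -> device array
    std::vector<float> back(100);
    copy(dev, back.begin());            // device array -> host iterator
}
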
2448 // ------------------------------------------------------------------------
2449 // array
2450 // ------------------------------------------------------------------------
2451 
2459 template <typename T, int N = 1>
2460 class array {
2461  static_assert(!std::is_const<T>::value, "array<const T> is not supported");
2462  static_assert(0 == (sizeof(T) % sizeof(int)), "only value types whose size is a multiple of the size of an integer are allowed in array");
2463 public:
2464 #if __KALMAR_ACCELERATOR__ == 1
2465  typedef Kalmar::_data<T> acc_buffer_t;
2466 #else
2467  typedef Kalmar::_data_host<T> acc_buffer_t;
2468 #endif
2469 
2473  static const int rank = N;
2474 
2478  typedef T value_type;
2479 
2483  array() = delete;
2484 
2493  array(const array& other)
2494  : array(other.get_extent(), other.get_accelerator_view())
2495  { Concurrency::copy(other, *this); }
2496 
2504  array(array&& other)
2505  : m_device(other.m_device), extent(other.extent)
2506  { other.m_device.reset(); }
2507 
2515  explicit array(const extent<N>& ext)
2516  : array(ext, accelerator(L"default").get_default_view()) {}
2517 
2525  explicit array(int e0)
2526  : array(Concurrency::extent<N>(e0)) { static_assert(N == 1, "illegal"); }
2527  explicit array(int e0, int e1)
2528  : array(Concurrency::extent<N>(e0, e1)) {}
2529  explicit array(int e0, int e1, int e2)
2530  : array(Concurrency::extent<N>(e0, e1, e2)) {}
2531 
2548  template <typename InputIter>
2549  array(const Concurrency::extent<N>& ext, InputIter srcBegin)
2550  : array(ext, srcBegin, accelerator(L"default").get_default_view()) {}
2551  template <typename InputIter>
2552  array(const Concurrency::extent<N>& ext, InputIter srcBegin, InputIter srcEnd)
2553  : array(ext, srcBegin, srcEnd, accelerator(L"default").get_default_view()) {}
2554 
2567  template <typename InputIter>
2568  array(int e0, InputIter srcBegin)
2569  : array(Concurrency::extent<N>(e0), srcBegin) {}
2570  template <typename InputIter>
2571  array(int e0, InputIter srcBegin, InputIter srcEnd)
2572  : array(Concurrency::extent<N>(e0), srcBegin, srcEnd) {}
2573  template <typename InputIter>
2574  array(int e0, int e1, InputIter srcBegin)
2575  : array(Concurrency::extent<N>(e0, e1), srcBegin) {}
2576  template <typename InputIter>
2577  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
2578  : array(Concurrency::extent<N>(e0, e1), srcBegin, srcEnd) {}
2579  template <typename InputIter>
2580  array(int e0, int e1, int e2, InputIter srcBegin)
2581  : array(Concurrency::extent<N>(e0, e1, e2), srcBegin) {}
2582  template <typename InputIter>
2583  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
2584  : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, srcEnd) {}
2585 
2599  explicit array(const array_view<const T, N>& src)
2600  : array(src.get_extent(), accelerator(L"default").get_default_view())
2601  { Concurrency::copy(src, *this); }
2602 
2625  array(const extent<N>& ext, accelerator_view av, access_type cpu_access_type = access_type_auto)
2626 #if __KALMAR_ACCELERATOR__ == 1
2627  : m_device(ext.size()), extent(ext) {}
2628 #else
2629  : m_device(av.pQueue, av.pQueue, check(ext).size(), cpu_access_type), extent(ext) {}
2630 #endif
2631 
2643  array(int e0, accelerator_view av, access_type cpu_access_type = access_type_auto)
2644  : array(Concurrency::extent<N>(e0), av, cpu_access_type) {}
2645  array(int e0, int e1, accelerator_view av, access_type cpu_access_type = access_type_auto)
2646  : array(Concurrency::extent<N>(e0, e1), av, cpu_access_type) {}
2647  array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type = access_type_auto)
2648  : array(Concurrency::extent<N>(e0, e1, e2), av, cpu_access_type) {}
2649 
2679  template <typename InputIter>
2680  array(const Concurrency::extent<N>& ext, InputIter srcBegin, accelerator_view av,
2681  access_type cpu_access_type = access_type_auto)
2682  : array(ext, av, cpu_access_type) { Concurrency::copy(srcBegin, *this); }
2683  template <typename InputIter>
2684  array(const Concurrency::extent<N>& ext, InputIter srcBegin, InputIter srcEnd,
2685  accelerator_view av, access_type cpu_access_type = access_type_auto)
2686  : array(ext, av, cpu_access_type) {
2687  if(ext.size() < std::distance(srcBegin, srcEnd))
2688  throw runtime_exception("errorMsg_throw", 0);
2689  Concurrency::copy(srcBegin, srcEnd, *this);
2690  }
2691 
2720  array(const array_view<const T, N>& src, accelerator_view av, access_type cpu_access_type = access_type_auto)
2721  : array(src.get_extent(), av, cpu_access_type) { Concurrency::copy(src, *this); }
2722 
2736  template <typename InputIter>
2737  array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
2738  : array(Concurrency::extent<N>(e0), srcBegin, av, cpu_access_type) {}
2739  template <typename InputIter>
2740  array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
2741  : array(Concurrency::extent<N>(e0), srcBegin, srcEnd, av, cpu_access_type) {}
2742  template <typename InputIter>
2743  array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
2744  : array(Concurrency::extent<N>(e0, e1), srcBegin, av, cpu_access_type) {}
2745  template <typename InputIter>
2746  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
2747  : array(Concurrency::extent<N>(e0, e1), srcBegin, srcEnd, av, cpu_access_type) {}
2748  template <typename InputIter>
2749  array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type = access_type_auto)
2750  : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, av, cpu_access_type) {}
2751  template <typename InputIter>
2752  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type = access_type_auto)
2753  : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, cpu_access_type) {}
2754 
2769  array(const Concurrency::extent<N>& ext, accelerator_view av, accelerator_view associated_av)
2770 #if __KALMAR_ACCELERATOR__ == 1
2771  : m_device(ext.size()), extent(ext) {}
2772 #else
2773  : m_device(av.pQueue, associated_av.pQueue, check(ext).size(), access_type_auto), extent(ext) {}
2774 #endif
2775 
2788  array(int e0, accelerator_view av, accelerator_view associated_av)
2789  : array(Concurrency::extent<N>(e0), av, associated_av) {}
2790  array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
2791  : array(Concurrency::extent<N>(e0, e1), av, associated_av) {}
2792  array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
2793  : array(Concurrency::extent<N>(e0, e1, e2), av, associated_av) {}
2794 
2812  template <typename InputIter>
2813  array(const Concurrency::extent<N>& ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
2814  : array(ext, av, associated_av) { Concurrency::copy(srcBegin, *this); }
2815  template <typename InputIter>
2816  array(const Concurrency::extent<N>& ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
2817  : array(ext, av, associated_av) {
2818  if(ext.size() < std::distance(srcBegin, srcEnd))
2819  throw runtime_exception("errorMsg_throw", 0);
2820  Concurrency::copy(srcBegin, srcEnd, *this);
2821  }
2822 
2841  array(const array_view<const T, N>& src, accelerator_view av, accelerator_view associated_av)
2842  : array(src.get_extent(), av, associated_av)
2843  { Concurrency::copy(src, *this); }
2844 
2859  template <typename InputIter>
2860  array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
2861  : array(Concurrency::extent<N>(e0), srcBegin, av, associated_av) {}
2862  template <typename InputIter>
2863  array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
2864  : array(Concurrency::extent<N>(e0), srcBegin, srcEnd, av, associated_av) {}
2865  template <typename InputIter>
2866  array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
2867  : array(Concurrency::extent<N>(e0, e1), srcBegin, av, associated_av) {}
2868  template <typename InputIter>
2869  array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
2870  : array(Concurrency::extent<N>(e0, e1), srcBegin, srcEnd, av, associated_av) {}
2871  template <typename InputIter>
2872  array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
2873  : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, av, associated_av) {}
2874  template <typename InputIter>
2875  array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
2876  : array(Concurrency::extent<N>(e0, e1, e2), srcBegin, srcEnd, av, associated_av) {}
2877 
2883  Concurrency::extent<N> get_extent() const restrict(amp,cpu) { return extent; }
2884 
2889  accelerator_view get_accelerator_view() const { return m_device.get_av(); }
2890 
2895  accelerator_view get_associated_accelerator_view() const { return m_device.get_stage(); }
2896 
2900  access_type get_cpu_access_type() const { return m_device.get_access(); }
2901 
2910  array& operator=(const array& other) {
2911  if (this != &other) {
2912  array arr(other);
2913  *this = std::move(arr);
2914  }
2915  return *this;
2916  }
2917 
2925  array& operator=(array&& other) {
2926  if (this != &other) {
2927  extent = other.extent;
2928  m_device = other.m_device;
2929  other.m_device.reset();
2930  }
2931  return *this;
2932  }
2933 
2942  array& operator=(const array_view<const T,N>& src) {
2943  array arr(src);
2944  *this = std::move(arr);
2945  return *this;
2946  }
2947 
2955  // FIXME: const is not defined in C++ AMP specification
2956  void copy_to(array& dest) const {
2957 #if __KALMAR_ACCELERATOR__ != 1
2958  for (int i = 0; i < N; i++)
2959  {
2960  if(dest.extent[i] < this->extent[i] )
2961  throw runtime_exception("errorMsg_throw", 0);
2962  }
2963 #endif
2964  copy(*this, dest);
2965  }
2966 
2974  // FIXME: const is not defined in C++ AMP specification
2975  void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
2976 
2982  // FIXME: const is not defined in C++ AMP specification
2983  // FIXME: missing const T* data() const
2984  T* data() const restrict(amp,cpu) {
2985 #if __KALMAR_ACCELERATOR__ != 1
2986  if (!m_device.get())
2987  return nullptr;
2988  m_device.synchronize(true);
2989 #endif
2990  return reinterpret_cast<T*>(m_device.get());
2991  }
2992 
3000  operator std::vector<T>() const {
3001  std::vector<T> vec(extent.size());
3002  Concurrency::copy(*this, vec.data());
3003  return vec;
3004  }
3005 
3017  T& operator[](const index<N>& idx) restrict(amp,cpu) {
3018 #if __KALMAR_ACCELERATOR__ != 1
3019  if (!m_device.get())
3020  throw runtime_exception("The array is not accessible on CPU.", 0);
3021  m_device.synchronize(true);
3022 #endif
3023  T *ptr = reinterpret_cast<T*>(m_device.get());
3024  return ptr[Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::flatten(idx, extent)];
3025  }
3026  T& operator()(const index<N>& idx) restrict(amp,cpu) {
3027  return (*this)[idx];
3028  }
3029 
3043  const T& operator[](const index<N>& idx) const restrict(amp,cpu) {
3044 #if __KALMAR_ACCELERATOR__ != 1
3045  if (!m_device.get())
3046  throw runtime_exception("The array is not accessible on CPU.", 0);
3047  m_device.synchronize();
3048 #endif
3049  T *ptr = reinterpret_cast<T*>(m_device.get());
3050  return ptr[Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::flatten(idx, extent)];
3051  }
3052  const T& operator()(const index<N>& idx) const restrict(amp,cpu) {
3053  return (*this)[idx];
3054  }
3055 
3066  T& operator()(int i0, int i1) restrict(amp,cpu) {
3067  return (*this)[index<2>(i0, i1)];
3068  }
3069  T& operator()(int i0, int i1, int i2) restrict(amp,cpu) {
3070  return (*this)[index<3>(i0, i1, i2)];
3071  }
3072 
3083  const T& operator()(int i0, int i1) const restrict(amp,cpu) {
3084  return (*this)[index<2>(i0, i1)];
3085  }
3086  const T& operator()(int i0, int i1, int i2) const restrict(amp,cpu) {
3087  return (*this)[index<3>(i0, i1, i2)];
3088  }
3089 
3109  typename array_projection_helper<T, N>::result_type
3110  operator[] (int i) restrict(amp,cpu) {
3111  return array_projection_helper<T, N>::project(*this, i);
3112  }
3113  typename array_projection_helper<T, N>::result_type
3114  operator()(int i0) restrict(amp,cpu) {
3115  return (*this)[i0];
3116  }
3117  typename array_projection_helper<T, N>::const_result_type
3118  operator[] (int i) const restrict(amp,cpu) {
3119  return array_projection_helper<T, N>::project(*this, i);
3120  }
3121  typename array_projection_helper<T, N>::const_result_type
3122  operator()(int i0) const restrict(amp,cpu) {
3123  return (*this)[i0];
3124  }
3125 
3146  array_view<T, N> section(const Concurrency::index<N>& origin, const Concurrency::extent<N>& ext) restrict(amp,cpu) {
3147 #if __KALMAR_ACCELERATOR__ != 1
3148  if( !Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::contains(origin, ext ,this->extent) )
3149  throw runtime_exception("errorMsg_throw", 0);
3150 #endif
3151  array_view<T, N> av(*this);
3152  return av.section(origin, ext);
3153  }
3154  array_view<const T, N> section(const Concurrency::index<N>& origin, const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
3155  array_view<const T, N> av(*this);
3156  return av.section(origin, ext);
3157  }
3158 
3165  array_view<T, N> section(const index<N>& idx) restrict(amp,cpu) {
3166 #if __KALMAR_ACCELERATOR__ != 1
3167  if( !Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::contains(idx, this->extent ) )
3168  throw runtime_exception("errorMsg_throw", 0);
3169 #endif
3170  array_view<T, N> av(*this);
3171  return av.section(idx);
3172  }
3173  array_view<const T, N> section(const index<N>& idx) const restrict(amp,cpu) {
3174  array_view<const T, N> av(*this);
3175  return av.section(idx);
3176  }
3177 
3184  array_view<T,N> section(const extent<N>& ext) restrict(amp,cpu) {
3185  array_view<T, N> av(*this);
3186  return av.section(ext);
3187  }
3188  array_view<const T,N> section(const extent<N>& ext) const restrict(amp,cpu) {
3189  array_view<const T, N> av(*this);
3190  return av.section(ext);
3191  }
3192 
3205  array_view<T, 1> section(int i0, int e0) restrict(amp,cpu) {
3206  static_assert(N == 1, "Rank must be 1");
3207  return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0));
3208  }
3209  array_view<const T, 1> section(int i0, int e0) const restrict(amp,cpu) {
3210  static_assert(N == 1, "Rank must be 1");
3211  return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0));
3212  }
3213  array_view<const T, 2> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) {
3214  static_assert(N == 2, "Rank must be 2");
3215  return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1));
3216  }
3217  array_view<T, 2> section(int i0, int i1, int e0, int e1) restrict(amp,cpu) {
3218  static_assert(N == 2, "Rank must be 2");
3219  return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1));
3220  }
3221  array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) restrict(amp,cpu) {
3222  static_assert(N == 3, "Rank must be 3");
3223  return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2));
3224  }
3225  array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) {
3226  static_assert(N == 3, "Rank must be 3");
3227  return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2));
3228  }
3229 
3253  template <typename ElementType>
3254  array_view<ElementType, 1> reinterpret_as() restrict(amp,cpu) {
3255 #if __KALMAR_ACCELERATOR__ != 1
3256  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
3257  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
3258  if( (extent.size() * sizeof(T)) % sizeof(ElementType))
3259  throw runtime_exception("errorMsg_throw", 0);
3260 #endif
3261  int size = extent.size() * sizeof(T) / sizeof(ElementType);
3262  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
3263  array_view<ElementType, 1> av(buffer_type(m_device), Concurrency::extent<1>(size), 0);
3264  return av;
3265  }
3266  template <typename ElementType>
3267  array_view<const ElementType, 1> reinterpret_as() const restrict(amp,cpu) {
3268 #if __KALMAR_ACCELERATOR__ != 1
3269  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
3270  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
3271 #endif
3272  int size = extent.size() * sizeof(T) / sizeof(ElementType);
3273  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
3274  array_view<const ElementType, 1> av(buffer_type(m_device), Concurrency::extent<1>(size), 0);
3275  return av;
3276  }
3277 
3293  template <int K> array_view<T, K>
3294  view_as(const Concurrency::extent<K>& viewExtent) restrict(amp,cpu) {
3295 #if __KALMAR_ACCELERATOR__ != 1
3296  if( viewExtent.size() > extent.size())
3297  throw runtime_exception("errorMsg_throw", 0);
3298 #endif
3299  array_view<T, K> av(m_device, viewExtent, 0);
3300  return av;
3301  }
3302  template <int K> array_view<const T, K>
3303  view_as(const Concurrency::extent<K>& viewExtent) const restrict(amp,cpu) {
3304 #if __KALMAR_ACCELERATOR__ != 1
3305  if( viewExtent.size() > extent.size())
3306  throw runtime_exception("errorMsg_throw", 0);
3307 #endif
3308  const array_view<T, K> av(m_device, viewExtent, 0);
3309  return av;
3310  }
3311 
3314  ~array() {}
3315 
3316  // FIXME: functions below are not defined in C++ AMP specification
3317  const acc_buffer_t& internal() const restrict(amp,cpu) { return m_device; }
3318  int get_offset() const restrict(amp,cpu) { return 0; }
3319  Concurrency::index<N> get_index_base() const restrict(amp,cpu) { return Concurrency::index<N>(); }
3320 private:
3321  template <typename K, int Q> friend struct projection_helper;
3322  template <typename K, int Q> friend struct array_projection_helper;
3323  acc_buffer_t m_device;
3324  Concurrency::extent<N> extent;
3325 
3326  template <typename Q, int K> friend
3327  void copy(const array<Q, K>&, const array_view<Q, K>&);
3328  template <typename Q, int K> friend
3329  void copy(const array_view<const Q, K>&, array<Q, K>&);
3330 };
3331 
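A minimal usage sketch for the array class above, assuming the extent-plus-iterator
constructor defined earlier in this file and the usual C++ AMP capture rules: arrays
are captured by reference in kernels, and the std::vector conversion operator copies
the data back to the host.

    #include <amp.h>
    #include <vector>
    using namespace Concurrency;

    std::vector<int> host(16, 1);
    array<int, 1> arr(extent<1>(16), host.begin());   // copy host data in
    parallel_for_each(arr.get_extent(), [&](index<1> idx) restrict(amp) {
        arr[idx] *= 2;                                // operator[](const index<N>&)
    });
    std::vector<int> result = arr;                    // operator std::vector<T>()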
3332 // ------------------------------------------------------------------------
3333 // utility classes for array_view
3334 // ------------------------------------------------------------------------
3335 
3336 template <typename T>
3337 struct __has_data
3338 {
3339 private:
3340  struct two {char __lx; char __lxx;};
3341  template <typename C> static char test(decltype(std::declval<C>().data()));
3342  template <typename C> static two test(...);
3343 public:
3344  static const bool value = sizeof(test<T>(0)) == 1;
3345 };
3346 
3347 template <typename T>
3348 struct __has_size
3349 {
3350 private:
3351  struct two {char __lx; char __lxx;};
3352  template <typename C> static char test(decltype(&C::size));
3353  template <typename C> static two test(...);
3354 public:
3355  static const bool value = sizeof(test<T>(0)) == 1;
3356 };
3357 
3358 template <typename T>
3359 struct __is_container
3360 {
3361  using _T = typename std::remove_reference<T>::type;
3362  static const bool value = __has_size<_T>::value && __has_data<_T>::value;
3363 };
3364 
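These three detectors are a classic SFINAE probe: the char-returning test overload is
viable only when the probed expression (a data() call, or taking the address of size)
is well-formed, so sizeof picks out which overload was chosen. A small sketch of what
__is_container accepts, assuming <vector> is available:

    static_assert(__is_container<std::vector<int>>::value,
                  "vector exposes both data() and size()");
    static_assert(!__is_container<int*>::value,
                  "a raw pointer exposes neither");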
3365 // ------------------------------------------------------------------------
3366 // array_view<T,N>
3367 // ------------------------------------------------------------------------
3368 
3375 template <typename T, int N = 1>
3376 class array_view
3377 {
3378  static_assert(0 == (sizeof(T) % sizeof(int)), "only value types whose size is a multiple of the size of an integer are allowed in array views");
3379 public:
3380  typedef typename std::remove_const<T>::type nc_T;
3381 #if __KALMAR_ACCELERATOR__ == 1
3382  typedef Kalmar::_data<T> acc_buffer_t;
3383 #else
3384  typedef Kalmar::_data_host<T> acc_buffer_t;
3385 #endif
3386 
3390  static const int rank = N;
3391 
3395  typedef T value_type;
3396 
3400  array_view() = delete;
3401 
3410  array_view(array<T, N>& src) restrict(amp,cpu)
3411  : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
3412 
3413  // FIXME: following interfaces were not implemented yet
3414  // template <typename Container>
3415  // explicit array_view<T, 1>::array_view(Container& src);
3416  // template <typename value_type, int Size>
3417  // explicit array_view<T, 1>::array_view(value_type (&src) [Size]) restrict(amp,cpu);
3418 
3429  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
3430  array_view(const Concurrency::extent<N>& extent, Container& src)
3431  : array_view(extent, src.data())
3432  { static_assert( std::is_same<decltype(src.data()), T*>::value, "container element type and array view element type must match"); }
3433 
3444  array_view(const Concurrency::extent<N>& ext, value_type* src) restrict(amp,cpu)
3445 #if __KALMAR_ACCELERATOR__ == 1
3446  : cache((T *)(src)), extent(ext), extent_base(ext), offset(0) {}
3447 #else
3448  : cache(ext.size(), (T *)(src)), extent(ext), extent_base(ext), offset(0) {}
3449 #endif
3450 
3461  explicit array_view(const Concurrency::extent<N>& ext)
3462  : cache(ext.size()), extent(ext), extent_base(ext), offset(0) {}
3463 
3474  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
3475  array_view(int e0, Container& src)
3476  : array_view(Concurrency::extent<N>(e0), src) {}
3477  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
3478  array_view(int e0, int e1, Container& src)
3479  : array_view(Concurrency::extent<N>(e0, e1), src) {}
3480  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
3481  array_view(int e0, int e1, int e2, Container& src)
3482  : array_view(Concurrency::extent<N>(e0, e1, e2), src) {}
3483 
3494  array_view(int e0, value_type *src) restrict(amp,cpu)
3495  : array_view(Concurrency::extent<N>(e0), src) {}
3496  array_view(int e0, int e1, value_type *src) restrict(amp,cpu)
3497  : array_view(Concurrency::extent<N>(e0, e1), src) {}
3498  array_view(int e0, int e1, int e2, value_type *src) restrict(amp,cpu)
3499  : array_view(Concurrency::extent<N>(e0, e1, e2), src) {}
3500 
3501 
3509  explicit array_view(int e0) restrict(cpu) : array_view(Concurrency::extent<N>(e0)) {}
3510  explicit array_view(int e0, int e1) restrict(cpu)
3511  : array_view(Concurrency::extent<N>(e0, e1)) {}
3512  explicit array_view(int e0, int e1, int e2) restrict(cpu)
3513  : array_view(Concurrency::extent<N>(e0, e1, e2)) {}
3514 
3523  array_view(const array_view& other) restrict(amp,cpu)
3524  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
3525 
3529  extent<N> get_extent() const restrict(amp,cpu) { return extent; }
3530 
3540  accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
3541 
3550  array_view& operator=(const array_view& other) restrict(amp,cpu) {
3551  if (this != &other) {
3552  cache = other.cache;
3553  extent = other.extent;
3554  index_base = other.index_base;
3555  extent_base = other.extent_base;
3556  offset = other.offset;
3557  }
3558  return *this;
3559  }
3560 
3568  void copy_to(array<T,N>& dest) const {
3569 #if __KALMAR_ACCELERATOR__ != 1
3570  for (int i = 0; i < N; i++) {
3571  if (dest.get_extent()[i] < this->extent[i])
3572  throw runtime_exception("errorMsg_throw", 0);
3573  }
3574 #endif
3575  copy(*this, dest);
3576  }
3577 
3585  void copy_to(const array_view& dest) const { copy(*this, dest); }
3586 
3602  T* data() const restrict(amp,cpu) {
3603 #if __KALMAR_ACCELERATOR__ != 1
3604  cache.get_cpu_access(true);
3605 #endif
3606  static_assert(N == 1, "data() is only permissible on array views of rank 1");
3607  return reinterpret_cast<T*>(cache.get() + offset + index_base[0]);
3608  }
3609 
3615  void refresh() const { cache.refresh(); }
3616 
3650  // FIXME: type parameter is not implemented
3651  void synchronize() const { cache.get_cpu_access(); }
3652 
3663  // FIXME: type parameter is not implemented
3664  completion_future synchronize_async() const {
3665  std::future<void> fut = std::async([&]() mutable { synchronize(); });
3666  return completion_future(fut.share());
3667  }
3668 
3702  // FIXME: type parameter is not implemented
3703  void synchronize_to(const accelerator_view& av) const {
3704 #if __KALMAR_ACCELERATOR__ != 1
3705  cache.sync_to(av.pQueue);
3706 #endif
3707  }
3708 
3724  // FIXME: this method is not implemented yet
3725  completion_future synchronize_to_async(const accelerator_view& av) const;
3726 
3734  void discard_data() const {
3735 #if __KALMAR_ACCELERATOR__ != 1
3736  cache.discard();
3737 #endif
3738  }
3739 
3748  T& operator[] (const index<N>& idx) const restrict(amp,cpu) {
3749 #if __KALMAR_ACCELERATOR__ != 1
3750  cache.get_cpu_access(true);
3751 #endif
3752  T *ptr = reinterpret_cast<T*>(cache.get() + offset);
3753  return ptr[Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::flatten(idx + index_base, extent_base)];
3754  }
3755 
3756  T& operator() (const index<N>& idx) const restrict(amp,cpu) {
3757  return (*this)[idx];
3758  }
3759 
3773  // FIXME: this method is not implemented
3774  T& get_ref(const index<N>& idx) const restrict(amp,cpu);
3775 
3784  T& operator() (int i0, int i1) const restrict(amp,cpu) {
3785  static_assert(N == 2, "T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
3786  return (*this)[index<2>(i0, i1)];
3787  }
3788 
3789  T& operator() (int i0, int i1, int i2) const restrict(amp,cpu) {
3790  static_assert(N == 3, "T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
3791  return (*this)[index<3>(i0, i1, i2)];
3792  }
3793 
3815  typename projection_helper<T, N>::result_type
3816  operator[] (int i) const restrict(amp,cpu) {
3817  return projection_helper<T, N>::project(*this, i);
3818  }
3819 
3820  typename projection_helper<T, N>::result_type
3821  operator() (int i0) const restrict(amp,cpu) { return (*this)[i0]; }
3822 
3843  array_view<T, N> section(const Concurrency::index<N>& idx,
3844  const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
3845 #if __KALMAR_ACCELERATOR__ != 1
3846  if ( !Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::contains(idx, ext,this->extent ) )
3847  throw runtime_exception("errorMsg_throw", 0);
3848 #endif
3849  array_view<T, N> av(cache, ext, extent_base, idx + index_base, offset);
3850  return av;
3851  }
3852 
3856  array_view<T, N> section(const Concurrency::index<N>& idx) const restrict(amp,cpu) {
3857  Concurrency::extent<N> ext(extent);
3858  Kalmar::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::minus(idx, ext);
3859  return section(idx, ext);
3860  }
3861 
3865  array_view<T, N> section(const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
3866  Concurrency::index<N> idx;
3867  return section(idx, ext);
3868  }
3869 
3880  array_view<T, 1> section(int i0, int e0) const restrict(amp,cpu) {
3881  static_assert(N == 1, "Rank must be 1");
3882  return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0));
3883  }
3884 
3885  array_view<T, 2> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) {
3886  static_assert(N == 2, "Rank must be 2");
3887  return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1));
3888  }
3889 
3890  array_view<T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) {
3891  static_assert(N == 3, "Rank must be 3");
3892  return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2));
3893  }
3894 
3908  template <typename ElementType>
3909  array_view<ElementType, N> reinterpret_as() const restrict(amp,cpu) {
3910  static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
3911 #if __KALMAR_ACCELERATOR__ != 1
3912  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
3913  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
3914  if ( (extent.size() * sizeof(T)) % sizeof(ElementType))
3915  throw runtime_exception("errorMsg_throw", 0);
3916 #endif
3917  int size = extent.size() * sizeof(T) / sizeof(ElementType);
3918  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
3919  array_view<ElementType, 1> av(buffer_type(cache),
3920  Concurrency::extent<1>(size),
3921  (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
3922  return av;
3923  }
3924 
3933  template <int K>
3934  array_view<T, K> view_as(Concurrency::extent<K> viewExtent) const restrict(amp,cpu) {
3935  static_assert(N == 1, "view_as is only permissible on array views of rank 1");
3936 #if __KALMAR_ACCELERATOR__ != 1
3937  if ( viewExtent.size() > extent.size())
3938  throw runtime_exception("errorMsg_throw", 0);
3939 #endif
3940  array_view<T, K> av(cache, viewExtent, offset + index_base[0]);
3941  return av;
3942  }
3943 
3944  ~array_view() restrict(amp,cpu) {}
3945 
3946  // FIXME: functions below are not defined in C++ AMP specification
3947  template <int D0, int D1=0, int D2=0>
3948  T& operator[] (const tiled_index<D0, D1, D2>& idx) const restrict(amp,cpu) {
3949 #if __KALMAR_ACCELERATOR__ != 1
3950  cache.get_cpu_access(true);
3951 #endif
3952  T *ptr = reinterpret_cast<T*>(cache.get() + offset);
3953  return ptr[Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::flatten(idx.global + index_base, extent_base)];
3954  }
3955 
3956  const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; }
3957 
3958  int get_offset() const restrict(amp,cpu) { return offset; }
3959 
3960  Concurrency::index<N> get_index_base() const restrict(amp,cpu) { return index_base; }
3961 
3962 private:
3963  template <typename K, int Q> friend struct projection_helper;
3964  template <typename K, int Q> friend struct array_projection_helper;
3965  template <typename Q, int K> friend class array;
3966  template <typename Q, int K> friend class array_view;
3967 
3968  template<typename Q, int K> friend
3969  bool is_flat(const array_view<Q, K>&) noexcept;
3970  template <typename Q, int K> friend
3971  void copy(const array<Q, K>&, const array_view<Q, K>&);
3972  template <typename InputIter, typename Q, int K> friend
3973  void copy(InputIter, InputIter, const array_view<Q, K>&);
3974  template <typename Q, int K> friend
3975  void copy(const array_view<const Q, K>&, array<Q, K>&);
3976  template <typename OutputIter, typename Q, int K> friend
3977  void copy(const array_view<Q, K>&, OutputIter);
3978  template <typename Q, int K> friend
3979  void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
3980 
3981  // used by view_as and reinterpret_as
3982  array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext,
3983  int offset) restrict(amp,cpu)
3984  : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
3985 
3986  // used by section and projection
3987  array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext_now,
3988  const Concurrency::extent<N>& ext_b,
3989  const Concurrency::index<N>& idx_b, int off) restrict(amp,cpu)
3990  : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {}
3991 
3992  acc_buffer_t cache;
3993  Concurrency::extent<N> extent;
3994  Concurrency::extent<N> extent_base;
3995  Concurrency::index<N> index_base;
3996  int offset;
3997 };
3998 
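A minimal sketch of the class above, assuming standard C++ AMP array_view semantics:
the view wraps existing host memory without an explicit copy, section() produces a
sub-view sharing the same cache, and synchronize() publishes kernel writes back to
the host buffer.

    std::vector<float> v(100, 0.0f);
    array_view<float, 2> av(10, 10, v);               // Container constructor
    array_view<float, 2> tile = av.section(index<2>(2, 2), extent<2>(4, 4));
    parallel_for_each(tile.get_extent(), [=](index<2> idx) restrict(amp) {
        tile[idx] = 1.0f;                             // views are captured by value
    });
    av.synchronize();                                 // make the writes visible in v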
3999 // ------------------------------------------------------------------------
4000 // array_view<const T,N>
4001 // ------------------------------------------------------------------------
4002 
4010 template <typename T, int N>
4011 class array_view<const T, N>
4012 {
4013 public:
4014  typedef typename std::remove_const<T>::type nc_T;
4015 
4016 #if __KALMAR_ACCELERATOR__ == 1
4017  typedef Kalmar::_data<nc_T> acc_buffer_t;
4018 #else
4019  typedef Kalmar::_data_host<const T> acc_buffer_t;
4020 #endif
4021 
4025  static const int rank = N;
4026 
4030  typedef const T value_type;
4031 
4035  array_view() = delete;
4036 
4045  array_view(const array<T,N>& src) restrict(amp,cpu)
4046  : cache(src.internal()), extent(src.get_extent()), extent_base(extent), index_base(), offset(0) {}
4047 
4048  // FIXME: following interfaces were not implemented yet
4049  // template <typename Container>
4050  // explicit array_view<const T, 1>::array_view(const Container& src);
4051  // template <typename value_type, int Size>
4052  // explicit array_view<const T, 1>::array_view(const value_type (&src) [Size]) restrict(amp,cpu);
4053 
4064  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
4065  array_view(const extent<N>& extent, const Container& src)
4066  : array_view(extent, src.data())
4067  { static_assert( std::is_same<typename std::remove_const<typename std::remove_reference<decltype(*src.data())>::type>::type, T>::value, "container element type and array view element type must match"); }
4068 
4079  array_view(const extent<N>& ext, const value_type* src) restrict(amp,cpu)
4080 #if __KALMAR_ACCELERATOR__ == 1
4081  : cache((nc_T*)(src)), extent(ext), extent_base(ext), offset(0) {}
4082 #else
4083  : cache(ext.size(), src), extent(ext), extent_base(ext), offset(0) {}
4084 #endif
4085 
4096  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
4097  array_view(int e0, Container& src) : array_view(Concurrency::extent<1>(e0), src) {}
4098  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
4099  array_view(int e0, int e1, Container& src)
4100  : array_view(Concurrency::extent<N>(e0, e1), src) {}
4101  template <typename Container, class = typename std::enable_if<__is_container<Container>::value>::type>
4102  array_view(int e0, int e1, int e2, Container& src)
4103  : array_view(Concurrency::extent<N>(e0, e1, e2), src) {}
4104 
4115  array_view(int e0, const value_type *src) restrict(amp,cpu)
4116  : array_view(Concurrency::extent<1>(e0), src) {}
4117  array_view(int e0, int e1, const value_type *src) restrict(amp,cpu)
4118  : array_view(Concurrency::extent<2>(e0, e1), src) {}
4119  array_view(int e0, int e1, int e2, const value_type *src) restrict(amp,cpu)
4120  : array_view(Concurrency::extent<3>(e0, e1, e2), src) {}
4121 
4130  array_view(const array_view& other) restrict(amp,cpu)
4131  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
4132 
4140  array_view(const array_view<nc_T, N>& other) restrict(amp,cpu)
4141  : cache(other.cache), extent(other.extent), extent_base(other.extent_base), index_base(other.index_base), offset(other.offset) {}
4142 
4146  extent<N> get_extent() const restrict(amp,cpu) { return extent; }
4147 
4157  accelerator_view get_source_accelerator_view() const { return cache.get_av(); }
4158 
4168  array_view& operator=(const array_view<T,N>& other) restrict(amp,cpu) {
4169  cache = other.cache;
4170  extent = other.extent;
4171  index_base = other.index_base;
4172  extent_base = other.extent_base;
4173  offset = other.offset;
4174  return *this;
4175  }
4176 
4177  array_view& operator=(const array_view& other) restrict(amp,cpu) {
4178  if (this != &other) {
4179  cache = other.cache;
4180  extent = other.extent;
4181  index_base = other.index_base;
4182  extent_base = other.extent_base;
4183  offset = other.offset;
4184  }
4185  return *this;
4186  }
4187 
4197  void copy_to(array<T,N>& dest) const { copy(*this, dest); }
4198 
4206  void copy_to(const array_view<T,N>& dest) const { copy(*this, dest); }
4207 
4223  const T* data() const restrict(amp,cpu) {
4224 #if __KALMAR_ACCELERATOR__ != 1
4225  cache.get_cpu_access();
4226 #endif
4227  static_assert(N == 1, "data() is only permissible on array views of rank 1");
4228  return reinterpret_cast<const T*>(cache.get() + offset + index_base[0]);
4229  }
4230 
4236  void refresh() const { cache.refresh(); }
4237 
4267  void synchronize() const { cache.get_cpu_access(); }
4268 
4279  completion_future synchronize_async() const {
4280  std::future<void> fut = std::async([&]() mutable { synchronize(); });
4281  return completion_future(fut.share());
4282  }
4283 
4295  void synchronize_to(const accelerator_view& av) const {
4296 #if __KALMAR_ACCELERATOR__ != 1
4297  cache.sync_to(av.pQueue);
4298 #endif
4299  }
4300 
4316  // FIXME: this method is not implemented yet
4317  completion_future synchronize_to_async(const accelerator_view& av) const;
4318 
4327  const T& operator[] (const index<N>& idx) const restrict(amp,cpu) {
4328 #if __KALMAR_ACCELERATOR__ != 1
4329  cache.get_cpu_access();
4330 #endif
4331  const T *ptr = reinterpret_cast<const T*>(cache.get() + offset);
4332  return ptr[Kalmar::amp_helper<N, index<N>, Concurrency::extent<N>>::flatten(idx + index_base, extent_base)];
4333  }
4334  const T& operator() (const index<N>& idx) const restrict(amp,cpu) {
4335  return (*this)[idx];
4336  }
4337 
4351  // FIXME: this method is not implemented
4352  const T& get_ref(const index<N>& idx) const restrict(amp,cpu);
4353 
4362  const T& operator() (int i0) const restrict(amp,cpu) {
4363  static_assert(N == 1, "const T& array_view::operator()(int) is only permissible on array_view<T, 1>");
4364  return (*this)[index<1>(i0)];
4365  }
4366 
4367  const T& operator() (int i0, int i1) const restrict(amp,cpu) {
4368  static_assert(N == 2, "const T& array_view::operator()(int,int) is only permissible on array_view<T, 2>");
4369  return (*this)[index<2>(i0, i1)];
4370  }
4371  const T& operator() (int i0, int i1, int i2) const restrict(amp,cpu) {
4372  static_assert(N == 3, "const T& array_view::operator()(int,int, int) is only permissible on array_view<T, 3>");
4373  return (*this)[index<3>(i0, i1, i2)];
4374  }
4375 
4397  typename projection_helper<const T, N>::const_result_type
4398  operator[] (int i) const restrict(amp,cpu) {
4399  return projection_helper<const T, N>::project(*this, i);
4400  }
4401 
4402  // FIXME: typename projection_helper<const T, N>::const_result_type
4403  // operator() (int i0) const restrict(amp,cpu);
4404  // is not implemented
4405 
4426  array_view<const T, N> section(const Concurrency::index<N>& idx,
4427  const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
4428  array_view<const T, N> av(cache, ext, extent_base, idx + index_base, offset);
4429  return av;
4430  }
4431 
4435  array_view<const T, N> section(const Concurrency::index<N>& idx) const restrict(amp,cpu) {
4436  Concurrency::extent<N> ext(extent);
4437  Kalmar::amp_helper<N, Concurrency::index<N>, Concurrency::extent<N>>::minus(idx, ext);
4438  return section(idx, ext);
4439  }
4440 
4444  array_view<const T, N> section(const Concurrency::extent<N>& ext) const restrict(amp,cpu) {
4445  Concurrency::index<N> idx;
4446  return section(idx, ext);
4447  }
4448 
4459  array_view<const T, 1> section(int i0, int e0) const restrict(amp,cpu) {
4460  static_assert(N == 1, "Rank must be 1");
4461  return section(Concurrency::index<1>(i0), Concurrency::extent<1>(e0));
4462  }
4463 
4464  array_view<const T, 2> section(int i0, int i1, int e0, int e1) const restrict(amp,cpu) {
4465  static_assert(N == 2, "Rank must be 2");
4466  return section(Concurrency::index<2>(i0, i1), Concurrency::extent<2>(e0, e1));
4467  }
4468 
4469  array_view<const T, 3> section(int i0, int i1, int i2, int e0, int e1, int e2) const restrict(amp,cpu) {
4470  static_assert(N == 3, "Rank must be 3");
4471  return section(Concurrency::index<3>(i0, i1, i2), Concurrency::extent<3>(e0, e1, e2));
4472  }
4473 
4487  template <typename ElementType>
4488  array_view<const ElementType, 1> reinterpret_as() const restrict(amp,cpu) {
4489  static_assert(N == 1, "reinterpret_as is only permissible on array views of rank 1");
4490 #if __KALMAR_ACCELERATOR__ != 1
4491  static_assert( ! (std::is_pointer<ElementType>::value ),"can't use pointer in the kernel");
4492  static_assert( ! (std::is_same<ElementType,short>::value ),"can't use short in the kernel");
4493 #endif
4494  int size = extent.size() * sizeof(T) / sizeof(ElementType);
4495  using buffer_type = typename array_view<ElementType, 1>::acc_buffer_t;
4496  array_view<const ElementType, 1> av(buffer_type(cache),
4497  Concurrency::extent<1>(size),
4498  (offset + index_base[0])* sizeof(T) / sizeof(ElementType));
4499  return av;
4500  }
4501 
4510  template <int K>
4511  array_view<const T, K> view_as(Concurrency::extent<K> viewExtent) const restrict(amp,cpu) {
4512  static_assert(N == 1, "view_as is only permissible on array views of rank 1");
4513 #if __KALMAR_ACCELERATOR__ != 1
4514  if ( viewExtent.size() > extent.size())
4515  throw runtime_exception("errorMsg_throw", 0);
4516 #endif
4517  array_view<const T, K> av(cache, viewExtent, offset + index_base[0]);
4518  return av;
4519  }
4520 
4521  ~array_view() restrict(amp,cpu) {}
4522 
4523  // FIXME: functions below are not defined in C++ AMP specification
4524  const acc_buffer_t& internal() const restrict(amp,cpu) { return cache; }
4525 
4526  int get_offset() const restrict(amp,cpu) { return offset; }
4527 
4528  Concurrency::index<N> get_index_base() const restrict(amp,cpu) { return index_base; }
4529 
4530 private:
4531  template <typename K, int Q> friend struct projection_helper;
4532  template <typename K, int Q> friend struct array_projection_helper;
4533  template <typename Q, int K> friend class array;
4534  template <typename Q, int K> friend class array_view;
4535 
4536  template<typename Q, int K> friend
4537  bool is_flat(const array_view<Q, K>&) noexcept;
4538  template <typename Q, int K> friend
4539  void copy(const array<Q, K>&, const array_view<Q, K>&);
4540  template <typename InputIter, typename Q, int K> friend
4541  void copy(InputIter, InputIter, const array_view<Q, K>&);
4542  template <typename Q, int K> friend
4543  void copy(const array_view<const Q, K>&, array<Q, K>&);
4544  template <typename OutputIter, typename Q, int K> friend
4545  void copy(const array_view<Q, K>&, OutputIter);
4546  template <typename Q, int K> friend
4547  void copy(const array_view<const Q, K>& src, const array_view<Q, K>& dest);
4548 
4549  // used by view_as and reinterpret_as
4550  array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext,
4551  int offset) restrict(amp,cpu)
4552  : cache(cache), extent(ext), extent_base(ext), offset(offset) {}
4553 
4554  // used by section and projection
4555  array_view(const acc_buffer_t& cache, const Concurrency::extent<N>& ext_now,
4556  const Concurrency::extent<N>& ext_b,
4557  const Concurrency::index<N>& idx_b, int off) restrict(amp,cpu)
4558  : cache(cache), extent(ext_now), extent_base(ext_b), index_base(idx_b), offset(off) {}
4559 
4560  acc_buffer_t cache;
4561  Concurrency::extent<N> extent;
4562  Concurrency::extent<N> extent_base;
4563  Concurrency::index<N> index_base;
4564  int offset;
4565 };
4566 
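A short sketch contrasting the read-only view above with the writable one, assuming
the common input/output pattern: the const view is never copied back, and
discard_data() on the output skips the initial copy-in.

    std::vector<int> in_host(64, 7), out_host(64);
    array_view<const int, 1> in(64, in_host);         // read-only view
    array_view<int, 1> out(64, out_host);
    out.discard_data();                               // contents will be overwritten
    parallel_for_each(out.get_extent(), [=](index<1> i) restrict(amp) {
        out[i] = in[i] * 2;
    });
    out.synchronize();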
4567 // ------------------------------------------------------------------------
4568 // global functions for extent
4569 // ------------------------------------------------------------------------
4570 
4581 // FIXME: the signature is not entirely the same as defined in:
4582 // C++AMP spec v1.2 #1253
4583 template <int N>
4584 extent<N> operator+(const extent<N>& lhs, const extent<N>& rhs) restrict(amp,cpu) {
4585  extent<N> __r = lhs;
4586  __r += rhs;
4587  return __r;
4588 }
4589 template <int N>
4590 extent<N> operator-(const extent<N>& lhs, const extent<N>& rhs) restrict(amp,cpu) {
4591  extent<N> __r = lhs;
4592  __r -= rhs;
4593  return __r;
4594 }
4595 
4612 // FIXME: the signature is not entirely the same as defined in:
4613 // C++AMP spec v1.2 #1259
4614 template <int N>
4615 extent<N> operator+(const extent<N>& ext, int value) restrict(amp,cpu) {
4616  extent<N> __r = ext;
4617  __r += value;
4618  return __r;
4619 }
4620 template <int N>
4621 extent<N> operator+(int value, const extent<N>& ext) restrict(amp,cpu) {
4622  extent<N> __r = ext;
4623  __r += value;
4624  return __r;
4625 }
4626 template <int N>
4627 extent<N> operator-(const extent<N>& ext, int value) restrict(amp,cpu) {
4628  extent<N> __r = ext;
4629  __r -= value;
4630  return __r;
4631 }
4632 template <int N>
4633 extent<N> operator-(int value, const extent<N>& ext) restrict(amp,cpu) {
4634  extent<N> __r(value);
4635  __r -= ext;
4636  return __r;
4637 }
4638 template <int N>
4639 extent<N> operator*(const extent<N>& ext, int value) restrict(amp,cpu) {
4640  extent<N> __r = ext;
4641  __r *= value;
4642  return __r;
4643 }
4644 template <int N>
4645 extent<N> operator*(int value, const extent<N>& ext) restrict(amp,cpu) {
4646  extent<N> __r = ext;
4647  __r *= value;
4648  return __r;
4649 }
4650 template <int N>
4651 extent<N> operator/(const extent<N>& ext, int value) restrict(amp,cpu) {
4652  extent<N> __r = ext;
4653  __r /= value;
4654  return __r;
4655 }
4656 template <int N>
4657 extent<N> operator/(int value, const extent<N>& ext) restrict(amp,cpu) {
4658  extent<N> __r(value);
4659  __r /= ext;
4660  return __r;
4661 }
4662 template <int N>
4663 extent<N> operator%(const extent<N>& ext, int value) restrict(amp,cpu) {
4664  extent<N> __r = ext;
4665  __r %= value;
4666  return __r;
4667 }
4668 template <int N>
4669 extent<N> operator%(int value, const extent<N>& ext) restrict(amp,cpu) {
4670  extent<N> __r(value);
4671  __r %= ext;
4672  return __r;
4673 }
4674 
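Each operator above follows the same value-semantics pattern: copy one operand,
apply the matching compound assignment, return the copy. For instance:

    extent<2> e(4, 8);
    extent<2> a = e + 2;                  // extent<2>(6, 10)
    extent<2> b = e * 3;                  // extent<2>(12, 24)
    extent<2> c = e - extent<2>(1, 2);    // extent<2>(3, 6)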
4677 // ------------------------------------------------------------------------
4678 // utility functions for copy
4679 // ------------------------------------------------------------------------
4680 
4681 template<typename T, int N>
4682 static inline bool is_flat(const array_view<T, N>& av) noexcept {
4683  return av.extent == av.extent_base && av.index_base == index<N>();
4684 }
4685 
4686 template<typename T>
4687 static inline bool is_flat(const array_view<T, 1>& av) noexcept { return true; }
4688 
4689 template <typename InputIter, typename T, int N, int dim>
4690 struct copy_input
4691 {
4692  void operator()(InputIter& It, T* ptr, const extent<N>& ext,
4693  const extent<N>& base, const index<N>& idx)
4694  {
4695  size_t stride = 1;
4696  for (int i = dim; i < N; i++)
4697  stride *= base[i];
4698  ptr += stride * idx[dim - 1];
4699  for (int i = 0; i < ext[dim - 1]; i++) {
4700  copy_input<InputIter, T, N, dim + 1>()(It, ptr, ext, base, idx);
4701  ptr += stride;
4702  }
4703  }
4704 };
4705 
4706 template <typename InputIter, typename T, int N>
4707 struct copy_input<InputIter, T, N, N>
4708 {
4709  void operator()(InputIter& It, T* ptr, const extent<N>& ext,
4710  const extent<N>& base, const index<N>& idx)
4711  {
4712  InputIter end = It;
4713  std::advance(end, ext[N - 1]);
4714  std::copy(It, end, ptr + idx[N - 1]);
4715  It = end;
4716  }
4717 };
4718 
4719 template <typename OutputIter, typename T, int N, int dim>
4720 struct copy_output
4721 {
4722  void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
4723  const extent<N>& base, const index<N>& idx)
4724  {
4725  size_t stride = 1;
4726  for (int i = dim; i < N; i++)
4727  stride *= base[i];
4728  ptr += stride * idx[dim - 1];
4729  for (int i = 0; i < ext[dim - 1]; i++) {
4730  copy_output<OutputIter, T, N, dim + 1>()(ptr, It, ext, base, idx);
4731  ptr += stride;
4732  }
4733  }
4734 };
4735 
4736 template <typename OutputIter, typename T, int N>
4737 struct copy_output<OutputIter, T, N, N>
4738 {
4739  void operator()(const T* ptr, OutputIter& It, const extent<N>& ext,
4740  const extent<N>& base, const index<N>& idx)
4741  {
4742  ptr += idx[N - 1];
4743  It = std::copy(ptr, ptr + ext[N - 1], It);
4744  }
4745 };
4746 
4747 template <typename T, int N, int dim>
4748 struct copy_bidir
4749 {
4750  void operator()(const T* src, T* dst, const extent<N>& ext,
4751  const extent<N>& base1, const index<N>& idx1,
4752  const extent<N>& base2, const index<N>& idx2)
4753  {
4754  size_t stride1 = 1;
4755  for (int i = dim; i < N; i++)
4756  stride1 *= base1[i];
4757  src += stride1 * idx1[dim - 1];
4758 
4759  size_t stride2 = 1;
4760  for (int i = dim; i < N; i++)
4761  stride2 *= base2[i];
4762  dst += stride2 * idx2[dim - 1];
4763 
4764  for (int i = 0; i < ext[dim - 1]; i++) {
4765  copy_bidir<T, N, dim + 1>()(src, dst, ext, base1, idx1, base2, idx2);
4766  src += stride1;
4767  dst += stride2;
4768  }
4769  }
4770 };
4771 
4772 template <typename T, int N>
4773 struct copy_bidir<T, N, N>
4774 {
4775  void operator()(const T* src, T* dst, const extent<N>& ext,
4776  const extent<N>& base1, const index<N>& idx1,
4777  const extent<N>& base2, const index<N>& idx2)
4778  {
4779  src += idx1[N - 1];
4780  dst += idx2[N - 1];
4781  std::copy(src, src + ext[N - 1], dst);
4782  }
4783 };
4784 
4785 template <typename Iter, typename T, int N>
4786 struct do_copy
4787 {
4788  template<template <typename, int> class _amp_container>
4789  void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, N>& dest) {
4790  size_t size = dest.get_extent().size();
4791  size_t offset = dest.get_offset();
4792  bool modify = true;
4793 
4794  T* ptr = dest.internal().map_ptr(modify, size, offset);
4795  std::copy(srcBegin, srcEnd, ptr);
4796  dest.internal().unmap_ptr(ptr, modify, size, offset);
4797  }
4798  template<template <typename, int> class _amp_container>
4799  void operator()(const _amp_container<T, N> &src, Iter destBegin) {
4800  size_t size = src.get_extent().size();
4801  size_t offset = src.get_offset();
4802  bool modify = false;
4803 
4804  const T* ptr = src.internal().map_ptr(modify, size, offset);
4805  std::copy(ptr, ptr + src.get_extent().size(), destBegin);
4806  src.internal().unmap_ptr(ptr, modify, size, offset);
4807  }
4808 };
4809 
4810 template <typename Iter, typename T>
4811 struct do_copy<Iter, T, 1>
4812 {
4813  template<template <typename, int> class _amp_container>
4814  void operator()(Iter srcBegin, Iter srcEnd, const _amp_container<T, 1>& dest) {
4815  size_t size = dest.get_extent().size();
4816  size_t offset = dest.get_offset() + dest.get_index_base()[0];
4817  bool modify = true;
4818 
4819  T* ptr = dest.internal().map_ptr(modify, size, offset);
4820  std::copy(srcBegin, srcEnd, ptr);
4821  dest.internal().unmap_ptr(ptr, modify, size, offset);
4822  }
4823  template<template <typename, int> class _amp_container>
4824  void operator()(const _amp_container<T, 1> &src, Iter destBegin) {
4825  size_t size = src.get_extent().size();
4826  size_t offset = src.get_offset() + src.get_index_base()[0];
4827  bool modify = false;
4828 
4829  const T* ptr = src.internal().map_ptr(modify, size, offset);
4830  std::copy(ptr, ptr + src.get_extent().size(), destBegin);
4831  src.internal().unmap_ptr(ptr, modify, size, offset);
4832  }
4833 };
4834 
4835 template <typename T, int N>
4836 struct do_copy<T*, T, N>
4837 {
4838  template<template <typename, int> class _amp_container>
4839  void operator()(T* srcBegin, T* srcEnd, const _amp_container<T, N>& dest) {
4840  dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd), dest.get_offset(), true);
4841  }
4842  template<template <typename, int> class _amp_container>
4843  void operator()(const _amp_container<T, N> &src, T* destBegin) {
4844  src.internal().read(destBegin, src.get_extent().size(), src.get_offset());
4845  }
4846 };
4847 
4848 template <typename T>
4849 struct do_copy<T*, T, 1>
4850 {
4851  template<template <typename, int> class _amp_container>
4852  void operator()(const T* srcBegin, const T* srcEnd, const _amp_container<T, 1>& dest) {
4853  dest.internal().write(srcBegin, std::distance(srcBegin, srcEnd),
4854  dest.get_offset() + dest.get_index_base()[0], true);
4855  }
4856  template<template <typename, int> class _amp_container>
4857  void operator()(const _amp_container<T, 1> &src, T* destBegin) {
4858  src.internal().read(destBegin, src.get_extent().size(),
4859  src.get_offset() + src.get_index_base()[0]);
4860  }
4861 };
4862 
4863 // ------------------------------------------------------------------------
4864 // copy
4865 // ------------------------------------------------------------------------
4866 
4875 template <typename T, int N>
4876 void copy(const array<T, N>& src, array<T, N>& dest) {
4877  src.internal().copy(dest.internal(), 0, 0, 0);
4878 }
4879 
4888 template <typename T, int N>
4889 void copy(const array<T, N>& src, const array_view<T, N>& dest) {
4890  if (is_flat(dest))
4891  src.internal().copy(dest.internal(), src.get_offset(),
4892  dest.get_offset(), dest.get_extent().size());
4893  else {
4894  // FIXME: logic here deserves to be reviewed
4895  size_t srcSize = src.extent.size();
4896  size_t srcOffset = 0;
4897  bool srcModify = false;
4898  size_t destSize = dest.extent_base.size();
4899  size_t destOffset = dest.offset;
4900  bool destModify = true;
4901 
4902  T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
4903  T* p = pSrc;
4904  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
4905  copy_input<T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
4906  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
4907  src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
4908  }
4909 }
4910 
4911 template <typename T>
4912 void copy(const array<T, 1>& src, const array_view<T, 1>& dest) {
4913  src.internal().copy(dest.internal(),
4914  src.get_offset() + src.get_index_base()[0],
4915  dest.get_offset() + dest.get_index_base()[0],
4916  dest.get_extent().size());
4917 }
4918 
4930 template <typename T, int N>
4931 void copy(const array_view<const T, N>& src, array<T, N>& dest) {
4932  if (is_flat(src)) {
4933  src.internal().copy(dest.internal(), src.get_offset(),
4934  dest.get_offset(), dest.get_extent().size());
4935  } else {
4936  // FIXME: logic here deserves to be reviewed
4937  size_t srcSize = src.extent_base.size();
4938  size_t srcOffset = src.offset;
4939  bool srcModify = false;
4940  size_t destSize = dest.extent.size();
4941  size_t destOffset = 0;
4942  bool destModify = true;
4943 
4944  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
4945  T* p = pDst;
4946  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
4947  copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
4948  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
4949  dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
4950  }
4951 }
4952 
4953 template <typename T, int N>
4954 void copy(const array_view<T, N>& src, array<T, N>& dest) {
4955  const array_view<const T, N> buf(src);
4956  copy(buf, dest);
4957 }
4958 
4959 template <typename T>
4960 void copy(const array_view<const T, 1>& src, array<T, 1>& dest) {
4961  src.internal().copy(dest.internal(),
4962  src.get_offset() + src.get_index_base()[0],
4963  dest.get_offset() + dest.get_index_base()[0],
4964  dest.get_extent().size());
4965 }
4966 
4978 template <typename T, int N>
4979 void copy(const array_view<const T, N>& src, const array_view<T, N>& dest) {
4980  if (is_flat(src)) {
4981  if (is_flat(dest))
4982  src.internal().copy(dest.internal(), src.get_offset(),
4983  dest.get_offset(), dest.get_extent().size());
4984  else {
4985  // FIXME: logic here deserves to be reviewed
4986  size_t srcSize = src.extent.size();
4987  size_t srcOffset = 0;
4988  bool srcModify = false;
4989  size_t destSize = dest.extent_base.size();
4990  size_t destOffset = dest.offset;
4991  bool destModify = true;
4992 
4993  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
4994  const T* p = pSrc;
4995  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
4996  copy_input<const T*, T, N, 1>()(pSrc, pDst, dest.extent, dest.extent_base, dest.index_base);
4997  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
4998  src.internal().unmap_ptr(p, srcModify, srcSize, srcOffset);
4999  }
5000  } else {
5001  if (is_flat(dest)) {
5002  // FIXME: logic here deserves to be reviewed
5003  size_t srcSize = src.extent_base.size();
5004  size_t srcOffset = src.offset;
5005  bool srcModify = false;
5006  size_t destSize = dest.extent.size();
5007  size_t destOffset = 0;
5008  bool destModify = true;
5009 
5010  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
5011  T* p = pDst;
5012  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
5013  copy_output<T*, T, N, 1>()(pSrc, pDst, src.extent, src.extent_base, src.index_base);
5014  dest.internal().unmap_ptr(p, destModify, destSize, destOffset);
5015  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
5016  } else {
5017  // FIXME: logic here deserves to be reviewed
5018  size_t srcSize = src.extent_base.size();
5019  size_t srcOffset = src.offset;
5020  bool srcModify = false;
5021  size_t destSize = dest.extent_base.size();
5022  size_t destOffset = dest.offset;
5023  bool destModify = true;
5024 
5025  const T* pSrc = src.internal().map_ptr(srcModify, srcSize, srcOffset);
5026  T* pDst = dest.internal().map_ptr(destModify, destSize, destOffset);
5027  copy_bidir<T, N, 1>()(pSrc, pDst, src.extent, src.extent_base,
5028  src.index_base, dest.extent_base, dest.index_base);
5029  dest.internal().unmap_ptr(pDst, destModify, destSize, destOffset);
5030  src.internal().unmap_ptr(pSrc, srcModify, srcSize, srcOffset);
5031  }
5032  }
5033 }
5034 
5035 template <typename T, int N>
5036 void copy(const array_view<T, N>& src, const array_view<T, N>& dest) {
5037  const array_view<const T, N> buf(src);
5038  copy(buf, dest);
5039 }
5040 
5041 template <typename T>
5042 void copy(const array_view<const T, 1>& src, const array_view<T, 1>& dest) {
5043  src.internal().copy(dest.internal(),
5044  src.get_offset() + src.get_index_base()[0],
5045  dest.get_offset() + dest.get_index_base()[0],
5046  dest.get_extent().size());
5047 }
5048 
5065 template <typename InputIter, typename T, int N>
5066 void copy(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
5067 #if __KALMAR_ACCELERATOR__ != 1
5068  if ((std::distance(srcBegin, srcEnd) <= 0) || (std::distance(srcBegin, srcEnd) < dest.get_extent().size()))
5069  throw runtime_exception("errorMsg_throw, source range smaller than destination extent", 0);
5070 #endif
5071  do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
5072 }
5073 
5074 template <typename InputIter, typename T, int N>
5075 void copy(InputIter srcBegin, array<T, N>& dest) {
5076  InputIter srcEnd = srcBegin;
5077  std::advance(srcEnd, dest.get_extent().size());
5078  Concurrency::copy(srcBegin, srcEnd, dest);
5079 }
5080 
5097 template <typename InputIter, typename T, int N>
5098 void copy(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
5099  if (is_flat(dest))
5100  do_copy<InputIter, T, N>()(srcBegin, srcEnd, dest);
5101  else {
5102  size_t size = dest.extent_base.size();
5103  size_t offset = dest.offset;
5104  bool modify = true;
5105 
5106  T* ptr = dest.internal().map_ptr(modify, size, offset);
5107  copy_input<InputIter, T, N, 1>()(srcBegin, ptr, dest.extent, dest.extent_base, dest.index_base);
5108  dest.internal().unmap_ptr(ptr, modify, size, offset);
5109  }
5110 }
5111 
5112 template <typename InputIter, typename T, int N>
5113 void copy(InputIter srcBegin, const array_view<T, N>& dest) {
5114  InputIter srcEnd = srcBegin;
5115  std::advance(srcEnd, dest.get_extent().size());
5116  copy(srcBegin, srcEnd, dest);
5117 }
5118 
5131 template <typename OutputIter, typename T, int N>
5132 void copy(const array<T, N> &src, OutputIter destBegin) {
5133  do_copy<OutputIter, T, N>()(src, destBegin);
5134 }
5135 
5146 template <typename OutputIter, typename T, int N>
5147 void copy(const array_view<T, N> &src, OutputIter destBegin) {
5148  if (is_flat(src))
5149  do_copy<OutputIter, T, N>()(src, destBegin);
5150  else {
5151  size_t size = src.extent_base.size();
5152  size_t offset = src.offset;
5153  bool modify = false;
5154 
5155  T* ptr = src.internal().map_ptr(modify, size, offset);
5156  copy_output<OutputIter, T, N, 1>()(ptr, destBegin, src.extent, src.extent_base, src.index_base);
5157  src.internal().unmap_ptr(ptr, modify, size, offset);
5158  }
5159 }
5160 
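A sketch of the three main copy directions defined above, assuming the array(int)
constructor from earlier in this file; the overloads take the fast single-copy path
when a view is flat and fall back to the dimension-wise helpers otherwise.

    std::vector<float> host(32, 1.0f);
    array<float, 1> a(32);
    copy(host.begin(), host.end(), a);    // host range -> array
    array<float, 1> b(32);
    copy(a, b);                           // array -> array
    copy(b, host.begin());                // array -> host range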
5161 // ------------------------------------------------------------------------
5162 // utility function for copy_async
5163 // ------------------------------------------------------------------------
5164 
5165 // ------------------------------------------------------------------------
5166 // copy_async
5167 // ------------------------------------------------------------------------
5168 
5177 template <typename T, int N>
5178 completion_future copy_async(const array<T, N>& src, array<T, N>& dest) {
5179  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5180  return completion_future(fut.share());
5181 }
5182 
5190 template <typename T, int N>
5191 completion_future copy_async(const array<T, N>& src, const array_view<T, N>& dest) {
5192  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5193  return completion_future(fut.share());
5194 }
5195 
5205 template <typename T, int N>
5206 completion_future copy_async(const array_view<const T, N>& src, array<T, N>& dest) {
5207  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5208  return completion_future(fut.share());
5209 }
5210 
5211 template <typename T, int N>
5212 completion_future copy_async(const array_view<T, N>& src, array<T, N>& dest) {
5213  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5214  return completion_future(fut.share());
5215 }
5216 
5228 template <typename T, int N>
5229 completion_future copy_async(const array_view<const T, N>& src, const array_view<T, N>& dest) {
5230  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5231  return completion_future(fut.share());
5232 }
5233 
5234 template <typename T, int N>
5235 completion_future copy_async(const array_view<T, N>& src, const array_view<T, N>& dest) {
5236  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5237  return completion_future(fut.share());
5238 }
5239 
5256 template <typename InputIter, typename T, int N>
5257 completion_future copy_async(InputIter srcBegin, InputIter srcEnd, array<T, N>& dest) {
5258  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
5259  return completion_future(fut.share());
5260 }
5261 
5262 template <typename InputIter, typename T, int N>
5263 completion_future copy_async(InputIter srcBegin, array<T, N>& dest) {
5264  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
5265  return completion_future(fut.share());
5266 }
5267 
5284 template <typename InputIter, typename T, int N>
5285 completion_future copy_async(InputIter srcBegin, InputIter srcEnd, const array_view<T, N>& dest) {
5286  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin, srcEnd]() mutable { copy(srcBegin, srcEnd, dest); });
5287  return completion_future(fut.share());
5288 }
5289 
5290 template <typename InputIter, typename T, int N>
5291 completion_future copy_async(InputIter srcBegin, const array_view<T, N>& dest) {
5292  std::future<void> fut = std::async(std::launch::deferred, [&, srcBegin]() mutable { copy(srcBegin, dest); });
5293  return completion_future(fut.share());
5294 }
5295 
5308 template <typename OutputIter, typename T, int N>
5309 completion_future copy_async(const array<T, N>& src, OutputIter destBegin) {
5310  std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
5311  return completion_future(fut.share());
5312 }
5313 
5324 template <typename OutputIter, typename T, int N>
5325 completion_future copy_async(const array_view<T, N>& src, OutputIter destBegin) {
5326  std::future<void> fut = std::async(std::launch::deferred, [&, destBegin]() mutable { copy(src, destBegin); });
5327  return completion_future(fut.share());
5328 }
5329 
5330 // FIXME: these functions are not defined in C++ AMP specification
5331 template <typename T, int N>
5332 completion_future copy_async(const array<T, N>& src, const array<T, N>& dest) {
5333  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5334  return completion_future(fut.share());
5335 }
5336 
5337 template <typename T, int N>
5338 completion_future copy_async(const array_view<const T, N>& src, const array<T, N>& dest) {
5339  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5340  return completion_future(fut.share());
5341 }
5342 
5343 template <typename T, int N>
5344 completion_future copy_async(const array_view<T, N>& src, const array<T, N>& dest) {
5345  std::future<void> fut = std::async(std::launch::deferred, [&]() mutable { copy(src, dest); });
5346  return completion_future(fut.share());
5347 }
5348 
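Every copy_async above wraps the synchronous copy in std::async with
std::launch::deferred, so the copy does not run concurrently: it executes lazily on
the thread that first waits on the returned completion_future. A sketch, assuming
completion_future::wait() from earlier in this file:

    std::vector<int> host(128, 3);
    array<int, 1> dev(128);
    completion_future f = copy_async(host.begin(), host.end(), dev);
    // ... other host work; the copy has not started yet ...
    f.wait();                             // the deferred copy executes here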
5349 // ------------------------------------------------------------------------
5350 // atomic functions
5351 // ------------------------------------------------------------------------
5352 
5368 #if __KALMAR_ACCELERATOR__ == 1
5369 extern "C" unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5370 extern "C" int atomic_exchange_int(int *p, int val) restrict(amp);
5371 extern "C" float atomic_exchange_float(float *p, float val) restrict(amp);
5372 
5373 static inline unsigned int atomic_exchange(unsigned int * dest, unsigned int val) restrict(amp,cpu) {
5374  return atomic_exchange_unsigned(dest, val);
5375 }
5376 static inline int atomic_exchange(int * dest, int val) restrict(amp,cpu) {
5377  return atomic_exchange_int(dest, val);
5378 }
5379 static inline float atomic_exchange(float * dest, float val) restrict(amp,cpu) {
5380  return atomic_exchange_float(dest, val);
5381 }
5382 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5383 unsigned int atomic_exchange_unsigned(unsigned int *p, unsigned int val);
5384 int atomic_exchange_int(int *p, int val);
5385 float atomic_exchange_float(float *p, float val);
5386 
5387 static inline unsigned int atomic_exchange(unsigned int *dest, unsigned int val) restrict(amp,cpu) {
5388  return atomic_exchange_unsigned(dest, val);
5389 }
5390 static inline int atomic_exchange(int *dest, int val) restrict(amp,cpu) {
5391  return atomic_exchange_int(dest, val);
5392 }
5393 static inline float atomic_exchange(float *dest, float val) restrict(amp,cpu) {
5394  return atomic_exchange_float(dest, val);
5395 }
5396 #else
5397 extern unsigned int atomic_exchange(unsigned int *dest, unsigned int val) restrict(amp,cpu);
5398 extern int atomic_exchange(int *dest, int val) restrict(amp, cpu);
5399 extern float atomic_exchange(float *dest, float val) restrict(amp, cpu);
5400 #endif
5401 
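A hedged example of the exchange primitive above: atomic_exchange returns the previous value, so exactly one thread in a kernel can observe the initial 0 and claim a one-shot flag (names and sizes below are illustrative):

    #include <amp.h>
    #include <vector>

    int claim_winner(int n) {
        std::vector<int> flag_host(1, 0), winner_host(1, -1);
        Concurrency::array_view<int, 1> flag(1, flag_host);
        Concurrency::array_view<int, 1> winner(1, winner_host);
        Concurrency::parallel_for_each(Concurrency::extent<1>(n),
            [=](Concurrency::index<1> i) restrict(amp) {
                // The old value 0 is handed to exactly one thread.
                if (Concurrency::atomic_exchange(&flag[0], 1) == 0)
                    winner[0] = i[0];
            });
        winner.synchronize();
        return winner_host[0];
    }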
5434 #if __KALMAR_ACCELERATOR__ == 1
5435 extern "C" unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val) restrict(amp);
5436 extern "C" int atomic_compare_exchange_int(int *dest, int expected_val, int val) restrict(amp);
5437 
5438 static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu) {
5439  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
5440  return (*dest == val);
5441 }
5442 static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu) {
5443  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
5444  return (*dest == val);
5445 }
5446 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5447 unsigned int atomic_compare_exchange_unsigned(unsigned int *dest, unsigned int expected_val, unsigned int val);
5448 int atomic_compare_exchange_int(int *dest, int expected_val, int val);
5449 
5450 static inline bool atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu) {
5451  *expected_val = atomic_compare_exchange_unsigned(dest, *expected_val, val);
5452  return (*dest == val);
5453 }
5454 static inline bool atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp,cpu) {
5455  *expected_val = atomic_compare_exchange_int(dest, *expected_val, val);
5456  return (*dest == val);
5457 }
5458 #else
5459 extern unsigned int atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val) restrict(amp,cpu);
5460 extern int atomic_compare_exchange(int *dest, int *expected_val, int val) restrict(amp, cpu);
5461 #endif
5462 
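Note the convention above: on failure, *expected_val is refreshed with the value just observed in *dest, which supports the usual compare-and-swap retry loop. A sketch (illustrative names) in which every thread atomically doubles a shared counter once:

    #include <amp.h>
    #include <vector>

    void double_counter(int n) {
        std::vector<int> c_host(1, 1);
        Concurrency::array_view<int, 1> counter(1, c_host);
        Concurrency::parallel_for_each(Concurrency::extent<1>(n),
            [=](Concurrency::index<1>) restrict(amp) {
                int expected = counter[0];
                // On failure 'expected' already holds the current value; retry.
                while (!Concurrency::atomic_compare_exchange(
                           &counter[0], &expected, expected * 2)) {
                }
            });
        counter.synchronize(); // c_host[0] is now 2^n (for small n)
    }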
5493 #if __KALMAR_ACCELERATOR__ == 1
5494 extern "C" unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5495 extern "C" int atomic_add_int(int *p, int val) restrict(amp);
5496 extern "C" float atomic_add_float(float *p, float val) restrict(amp);
5497 
5498 static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5499  return atomic_add_unsigned(x, y);
5500 }
5501 static inline int atomic_fetch_add(int *x, int y) restrict(amp,cpu) {
5502  return atomic_add_int(x, y);
5503 }
5504 static inline float atomic_fetch_add(float *x, float y) restrict(amp,cpu) {
5505  return atomic_add_float(x, y);
5506 }
5507 
5508 extern "C" unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5509 extern "C" int atomic_sub_int(int *p, int val) restrict(amp);
5510 extern "C" float atomic_sub_float(float *p, float val) restrict(amp);
5511 
5512 static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5513  return atomic_sub_unsigned(x, y);
5514 }
5515 static inline int atomic_fetch_sub(int *x, int y) restrict(amp,cpu) {
5516  return atomic_sub_int(x, y);
5517 }
5518 static inline float atomic_fetch_sub(float *x, float y) restrict(amp,cpu) {
5519  return atomic_sub_float(x, y);
5520 }
5521 
5522 extern "C" unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5523 extern "C" int atomic_and_int(int *p, int val) restrict(amp);
5524 
5525 static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5526  return atomic_and_unsigned(x, y);
5527 }
5528 static inline int atomic_fetch_and(int *x, int y) restrict(amp,cpu) {
5529  return atomic_and_int(x, y);
5530 }
5531 
5532 extern "C" unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5533 extern "C" int atomic_or_int(int *p, int val) restrict(amp);
5534 
5535 static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5536  return atomic_or_unsigned(x, y);
5537 }
5538 static inline int atomic_fetch_or(int *x, int y) restrict(amp,cpu) {
5539  return atomic_or_int(x, y);
5540 }
5541 
5542 extern "C" unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5543 extern "C" int atomic_xor_int(int *p, int val) restrict(amp);
5544 
5545 static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5546  return atomic_xor_unsigned(x, y);
5547 }
5548 static inline int atomic_fetch_xor(int *x, int y) restrict(amp,cpu) {
5549  return atomic_xor_int(x, y);
5550 }
5551 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5552 unsigned int atomic_add_unsigned(unsigned int *p, unsigned int val);
5553 int atomic_add_int(int *p, int val);
5554 float atomic_add_float(float *p, float val);
5555 
5556 static inline unsigned int atomic_fetch_add(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5557  return atomic_add_unsigned(x, y);
5558 }
5559 static inline int atomic_fetch_add(int *x, int y) restrict(amp,cpu) {
5560  return atomic_add_int(x, y);
5561 }
5562 static inline float atomic_fetch_add(float *x, float y) restrict(amp,cpu) {
5563  return atomic_add_float(x, y);
5564 }
5565 
5566 unsigned int atomic_sub_unsigned(unsigned int *p, unsigned int val);
5567 int atomic_sub_int(int *p, int val);
5568 float atomic_sub_float(float *p, float val);
5569 
5570 static inline unsigned int atomic_fetch_sub(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5571  return atomic_sub_unsigned(x, y);
5572 }
5573 static inline int atomic_fetch_sub(int *x, int y) restrict(amp,cpu) {
5574  return atomic_sub_int(x, y);
5575 }
5576 static inline float atomic_fetch_sub(float *x, float y) restrict(amp,cpu) {
5577  return atomic_sub_float(x, y);
5578 }
5579 
5580 unsigned int atomic_and_unsigned(unsigned int *p, unsigned int val);
5581 int atomic_and_int(int *p, int val);
5582 
5583 static inline unsigned int atomic_fetch_and(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5584  return atomic_and_unsigned(x, y);
5585 }
5586 static inline int atomic_fetch_and(int *x, int y) restrict(amp,cpu) {
5587  return atomic_and_int(x, y);
5588 }
5589 
5590 unsigned int atomic_or_unsigned(unsigned int *p, unsigned int val);
5591 int atomic_or_int(int *p, int val);
5592 
5593 static inline unsigned int atomic_fetch_or(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5594  return atomic_or_unsigned(x, y);
5595 }
5596 static inline int atomic_fetch_or(int *x, int y) restrict(amp,cpu) {
5597  return atomic_or_int(x, y);
5598 }
5599 
5600 unsigned int atomic_xor_unsigned(unsigned int *p, unsigned int val);
5601 int atomic_xor_int(int *p, int val);
5602 
5603 static inline unsigned int atomic_fetch_xor(unsigned int *x, unsigned int y) restrict(amp,cpu) {
5604  return atomic_xor_unsigned(x, y);
5605 }
5606 static inline int atomic_fetch_xor(int *x, int y) restrict(amp,cpu) {
5607  return atomic_xor_int(x, y);
5608 }
5609 #else
5610 extern unsigned atomic_fetch_add(unsigned *x, unsigned y) restrict(amp,cpu);
5611 extern int atomic_fetch_add(int *x, int y) restrict(amp, cpu);
5612 extern float atomic_fetch_add(float *x, float y) restrict(amp, cpu);
5613 
5614 extern unsigned atomic_fetch_sub(unsigned *x, unsigned y) restrict(amp,cpu);
5615 extern int atomic_fetch_sub(int *x, int y) restrict(amp, cpu);
5616 extern float atomic_fetch_sub(float *x, float y) restrict(amp, cpu);
5617 
5618 extern unsigned atomic_fetch_and(unsigned *x, unsigned y) restrict(amp,cpu);
5619 extern int atomic_fetch_and(int *x, int y) restrict(amp, cpu);
5620 
5621 extern unsigned atomic_fetch_or(unsigned *x, unsigned y) restrict(amp,cpu);
5622 extern int atomic_fetch_or(int *x, int y) restrict(amp, cpu);
5623 
5624 extern unsigned atomic_fetch_xor(unsigned *x, unsigned y) restrict(amp,cpu);
5625 extern int atomic_fetch_xor(int *x, int y) restrict(amp, cpu);
5626 #endif
5627 
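A usage sketch for the fetch-and-op family above (sizes illustrative): a 16-bin histogram whose concurrent increments are serialized by atomic_fetch_add:

    #include <amp.h>
    #include <vector>

    std::vector<int> histogram16(const std::vector<int>& data) {
        std::vector<int> bins_host(16, 0);
        Concurrency::array_view<int, 1> bins(16, bins_host);
        Concurrency::array_view<const int, 1> in((int)data.size(), data);
        Concurrency::parallel_for_each(in.get_extent(),
            [=](Concurrency::index<1> i) restrict(amp) {
                // The old count is returned; only the side effect matters here.
                Concurrency::atomic_fetch_add(&bins[in[i] & 15], 1);
            });
        bins.synchronize();
        return bins_host;
    }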
5628 #if __KALMAR_ACCELERATOR__ == 1
5629 extern "C" unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5630 extern "C" int atomic_max_int(int *p, int val) restrict(amp);
5631 
5632 static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) restrict(amp) {
5633  return atomic_max_unsigned(x, y);
5634 }
5635 static inline int atomic_fetch_max(int *x, int y) restrict(amp) {
5636  return atomic_max_int(x, y);
5637 }
5638 
5639 extern "C" unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val) restrict(amp);
5640 extern "C" int atomic_min_int(int *p, int val) restrict(amp);
5641 
5642 static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) restrict(amp) {
5643  return atomic_min_unsigned(x, y);
5644 }
5645 static inline int atomic_fetch_min(int *x, int y) restrict(amp) {
5646  return atomic_min_int(x, y);
5647 }
5648 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5649 unsigned int atomic_max_unsigned(unsigned int *p, unsigned int val);
5650 int atomic_max_int(int *p, int val);
5651 
5652 static inline unsigned int atomic_fetch_max(unsigned int *x, unsigned int y) restrict(amp) {
5653  return atomic_max_unsigned(x, y);
5654 }
5655 static inline int atomic_fetch_max(int *x, int y) restrict(amp) {
5656  return atomic_max_int(x, y);
5657 }
5658 
5659 unsigned int atomic_min_unsigned(unsigned int *p, unsigned int val);
5660 int atomic_min_int(int *p, int val);
5661 
5662 static inline unsigned int atomic_fetch_min(unsigned int *x, unsigned int y) restrict(amp) {
5663  return atomic_min_unsigned(x, y);
5664 }
5665 static inline int atomic_fetch_min(int *x, int y) restrict(amp) {
5666  return atomic_min_int(x, y);
5667 }
5668 #else
5669 extern int atomic_fetch_max(int * dest, int val) restrict(amp, cpu);
5670 extern unsigned int atomic_fetch_max(unsigned int * dest, unsigned int val) restrict(amp, cpu);
5671 
5672 extern int atomic_fetch_min(int * dest, int val) restrict(amp, cpu);
5673 extern unsigned int atomic_fetch_min(unsigned int * dest, unsigned int val) restrict(amp, cpu);
5674 #endif
5675 
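Unlike the preceding group, these max/min helpers are declared restrict(amp) only in the accelerator paths, so they belong inside kernels. An illustrative global-max reduction:

    #include <amp.h>
    #include <climits>
    #include <vector>

    int global_max(const std::vector<int>& data) {
        std::vector<int> m_host(1, INT_MIN);
        Concurrency::array_view<int, 1> m(1, m_host);
        Concurrency::array_view<const int, 1> in((int)data.size(), data);
        Concurrency::parallel_for_each(in.get_extent(),
            [=](Concurrency::index<1> i) restrict(amp) {
                Concurrency::atomic_fetch_max(&m[0], in[i]);
            });
        m.synchronize();
        return m_host[0];
    }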
5691 #if __KALMAR_ACCELERATOR__ == 1
5692 extern "C" unsigned int atomic_inc_unsigned(unsigned int *p) restrict(amp);
5693 extern "C" int atomic_inc_int(int *p) restrict(amp);
5694 
5695 static inline unsigned int atomic_fetch_inc(unsigned int *x) restrict(amp,cpu) {
5696  return atomic_inc_unsigned(x);
5697 }
5698 static inline int atomic_fetch_inc(int *x) restrict(amp,cpu) {
5699  return atomic_inc_int(x);
5700 }
5701 
5702 extern "C" unsigned int atomic_dec_unsigned(unsigned int *p) restrict(amp);
5703 extern "C" int atomic_dec_int(int *p) restrict(amp);
5704 
5705 static inline unsigned int atomic_fetch_dec(unsigned int *x) restrict(amp,cpu) {
5706  return atomic_dec_unsigned(x);
5707 }
5708 static inline int atomic_fetch_dec(int *x) restrict(amp,cpu) {
5709  return atomic_dec_int(x);
5710 }
5711 #elif __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5712 unsigned int atomic_inc_unsigned(unsigned int *p);
5713 int atomic_inc_int(int *p);
5714 
5715 static inline unsigned atomic_fetch_inc(unsigned *x) restrict(amp,cpu) {
5716  return atomic_inc_unsigned(x);
5717 }
5718 static inline int atomic_fetch_inc(int *x) restrict(amp,cpu) {
5719  return atomic_inc_int(x);
5720 }
5721 
5722 unsigned int atomic_dec_unsigned(unsigned int *p);
5723 int atomic_dec_int(int *p);
5724 
5725 static inline unsigned atomic_fetch_dec(unsigned *x) restrict(amp,cpu) {
5726  return atomic_dec_unsigned(x);
5727 }
5728 static inline int atomic_fetch_dec(int *x) restrict(amp,cpu) {
5729  return atomic_dec_int(x);
5730 }
5731 #else
5732 extern int atomic_fetch_inc(int * _Dest) restrict(amp, cpu);
5733 extern unsigned int atomic_fetch_inc(unsigned int * _Dest) restrict(amp, cpu);
5734 
5735 extern int atomic_fetch_dec(int * _Dest) restrict(amp, cpu);
5736 extern unsigned int atomic_fetch_dec(unsigned int * _Dest) restrict(amp, cpu);
5737 #endif
5738 
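Because the increment/decrement forms return the pre-update value, they make natural slot allocators. A stream-compaction sketch (illustrative; assumes dst is at least as large as src):

    #include <amp.h>
    #include <vector>

    int compact_positive(const std::vector<int>& src, std::vector<int>& dst) {
        std::vector<int> cursor_host(1, 0);
        Concurrency::array_view<int, 1> cursor(1, cursor_host);
        Concurrency::array_view<const int, 1> in((int)src.size(), src);
        Concurrency::array_view<int, 1> out((int)dst.size(), dst);
        Concurrency::parallel_for_each(in.get_extent(),
            [=](Concurrency::index<1> i) restrict(amp) {
                if (in[i] > 0) {
                    // Pre-increment value == a unique slot (order unspecified).
                    int slot = Concurrency::atomic_fetch_inc(&cursor[0]);
                    out[slot] = in[i];
                }
            });
        out.synchronize();
        cursor.synchronize();
        return cursor_host[0]; // number of elements kept
    }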
5741 // ------------------------------------------------------------------------
5742 // parallel_for_each
5743 // ------------------------------------------------------------------------
5744 
5745 template <int N, typename Kernel>
5746 void parallel_for_each(const accelerator_view&, extent<N> compute_domain, const Kernel& f);
5747 
5748 template <int D0, int D1, int D2, typename Kernel>
5749 void parallel_for_each(const accelerator_view& accl_view,
5750  tiled_extent<D0,D1,D2> compute_domain, const Kernel& f);
5751 
5752 template <int D0, int D1, typename Kernel>
5753 void parallel_for_each(const accelerator_view& accl_view,
5754  tiled_extent<D0,D1> compute_domain, const Kernel& f);
5755 
5756 template <int D0, typename Kernel>
5757 void parallel_for_each(const accelerator_view& accl_view,
5758  tiled_extent<D0> compute_domain, const Kernel& f);
5759 
5760 template <int N, typename Kernel>
5761 void parallel_for_each(extent<N> compute_domain, const Kernel& f){
5762  auto que = Kalmar::get_availabe_que(f);
5763  const accelerator_view av(que);
5764  parallel_for_each(av, compute_domain, f);
5765 }
5766 
5767 template <int D0, int D1, int D2, typename Kernel>
5768 void parallel_for_each(tiled_extent<D0,D1,D2> compute_domain, const Kernel& f) {
5769  auto que = Kalmar::get_availabe_que(f);
5770  const accelerator_view av(que);
5771  parallel_for_each(av, compute_domain, f);
5772 }
5773 
5774 template <int D0, int D1, typename Kernel>
5775 void parallel_for_each(tiled_extent<D0,D1> compute_domain, const Kernel& f) {
5776  auto que = Kalmar::get_availabe_que(f);
5777  const accelerator_view av(que);
5778  parallel_for_each(av, compute_domain, f);
5779 }
5780 
5781 template <int D0, typename Kernel>
5782 void parallel_for_each(tiled_extent<D0> compute_domain, const Kernel& f) {
5783  auto que = Kalmar::get_availabe_que(f);
5784  const accelerator_view av(que);
5785  parallel_for_each(av, compute_domain, f);
5786 }
5787 
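The four overloads above obtain a queue for the kernel (via Kalmar::get_availabe_que) and forward to the accelerator_view forms defined later in this file. The canonical element-wise launch, as a sketch:

    #include <amp.h>
    #include <vector>

    void vadd(const std::vector<float>& a, const std::vector<float>& b,
              std::vector<float>& c) {
        int n = (int)a.size();
        Concurrency::array_view<const float, 1> av(n, a), bv(n, b);
        Concurrency::array_view<float, 1> cv(n, c);
        cv.discard_data(); // c's old contents need not reach the device
        Concurrency::parallel_for_each(cv.get_extent(),
            [=](Concurrency::index<1> i) restrict(amp) {
                cv[i] = av[i] + bv[i];
            });
        cv.synchronize(); // copy results back into c
    }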
5788 template <int N, typename Kernel, typename _Tp>
5789 struct pfe_helper
5790 {
5791  static inline void call(Kernel& k, _Tp& idx) restrict(amp,cpu) {
5792  int i;
5793  for (i = 0; i < k.ext[N - 1]; ++i) {
5794  idx[N - 1] = i;
5795  pfe_helper<N - 1, Kernel, _Tp>::call(k, idx);
5796  }
5797  }
5798 };
5799 template <typename Kernel, typename _Tp>
5800 struct pfe_helper<0, Kernel, _Tp>
5801 {
5802  static inline void call(Kernel& k, _Tp& idx) restrict(amp,cpu) {
5803 #if __KALMAR_ACCELERATOR__ == 1
5804  k.k(idx);
5805 #endif
5806  }
5807 };
5808 
5809 template <int N, typename Kernel>
5810 class pfe_wrapper
5811 {
5812 public:
5813  explicit pfe_wrapper(extent<N>& other, const Kernel& f) restrict(amp,cpu)
5814  : ext(other), k(f) {}
5815  void operator() (index<N> idx) restrict(amp,cpu) {
5816  pfe_helper<N - 3, pfe_wrapper<N, Kernel>, index<N>>::call(*this, idx);
5817  }
5818 private:
5819  const extent<N> ext;
5820  const Kernel k;
5821  template <int K, typename Ker, typename _Tp>
5822  friend struct pfe_helper;
5823 };
5824 
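How the two templates above cooperate: the underlying launch is at most 3-D (see the ext[3] packing below), so for rank N > 3 pfe_wrapper::operator() enters pfe_helper<N-3>, each level of which peels one leading dimension into an in-thread loop; pfe_helper<0> finally invokes the user kernel. Conceptually, for N == 5 each hardware thread (already carrying idx[2], idx[3], idx[4]) executes:

    // Conceptual expansion, not code from this header:
    for (int i1 = 0; i1 < ext[1]; ++i1) {      // pfe_helper<2, ...>
        idx[1] = i1;
        for (int i0 = 0; i0 < ext[0]; ++i0) {  // pfe_helper<1, ...>
            idx[0] = i0;
            k(idx);                            // pfe_helper<0, ...> runs the kernel
        }
    }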
5825 #pragma clang diagnostic push
5826 #pragma clang diagnostic ignored "-Wunused-variable"
5827 template <int N, typename Kernel>
5828 __attribute__((noinline,used))
5829 void parallel_for_each(const accelerator_view& av, extent<N> compute_domain,
5830  const Kernel& f) restrict(cpu, amp) {
5831 #if __KALMAR_ACCELERATOR__ != 1
5832 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5833  int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
5834  auto bar = &pfe_wrapper<N, Kernel>::operator();
5835  auto qq = &index<N>::__cxxamp_opencl_index;
5836  int* foo = reinterpret_cast<int*>(&pfe_wrapper<N, Kernel>::__cxxamp_trampoline);
5837 #endif
5838  size_t compute_domain_size = 1;
5839  for(int i = 0 ; i < N ; i++)
5840  {
5841  if(compute_domain[i]<=0)
5842  throw invalid_compute_domain("Extent is less than or equal to 0.");
5843  if (static_cast<size_t>(compute_domain[i]) > 4294967295L)
5844  throw invalid_compute_domain("Extent size too large.");
5845  compute_domain_size *= static_cast<size_t>(compute_domain[i]);
5846  if (compute_domain_size > 4294967295L)
5847  throw invalid_compute_domain("Extent size too large.");
5848  }
5849 
5850  size_t ext[3] = {static_cast<size_t>(compute_domain[N - 1]),
5851  static_cast<size_t>(compute_domain[N - 2]),
5852  static_cast<size_t>(compute_domain[N - 3])};
5853 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5854  if (is_cpu()) {
5855  launch_cpu_task(av.pQueue, f, compute_domain);
5856  return;
5857  }
5858 #endif
5859  if (av.get_accelerator().get_device_path() == L"cpu") {
5860  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
5861  }
5862  const pfe_wrapper<N, Kernel> _pf(compute_domain, f);
5863  Kalmar::mcw_cxxamp_launch_kernel<pfe_wrapper<N, Kernel>, 3>(av.pQueue, ext, NULL, _pf);
5864 #else
5865 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5866  int* foo1 = reinterpret_cast<int*>(&Kernel::__cxxamp_trampoline);
5867 #endif
5868  auto bar = &pfe_wrapper<N, Kernel>::operator();
5869  auto qq = &index<N>::__cxxamp_opencl_index;
5870  int* foo = reinterpret_cast<int*>(&pfe_wrapper<N, Kernel>::__cxxamp_trampoline);
5871 #endif
5872 }
5873 #pragma clang diagnostic pop
5874 
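Consequently a rank > 3 launch through the implementation above is legal whenever every extent component, and the total size, fit in 32 bits; dimensions N-3..N-1 map onto the hardware grid and the leading dimensions are looped per thread. An illustrative rank-4 launch:

    #include <amp.h>
    #include <vector>

    void fill4d() {
        Concurrency::extent<4> e(2, 8, 8, 8); // idx[0] looped, idx[1..3] on the grid
        std::vector<float> buf(e.size());
        Concurrency::array_view<float, 4> v(e, buf);
        v.discard_data();
        Concurrency::parallel_for_each(e,
            [=](Concurrency::index<4> i) restrict(amp) {
                v[i] = 1.0f;
            });
        v.synchronize();
    }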
5875 #pragma clang diagnostic push
5876 #pragma clang diagnostic ignored "-Wunused-variable"
5877 //1D parallel_for_each, nontiled
5878 template <typename Kernel>
5879 __attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av,
5880  extent<1> compute_domain, const Kernel& f) restrict(cpu,amp) {
5881 #if __KALMAR_ACCELERATOR__ != 1
5882  if(compute_domain[0]<=0) {
5883  throw invalid_compute_domain("Extent is less than or equal to 0.");
5884  }
5885  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
5886  throw invalid_compute_domain("Extent size too large.");
5887 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5888  if (is_cpu()) {
5889  launch_cpu_task(av.pQueue, f, compute_domain);
5890  return;
5891  }
5892 #endif
5893  size_t ext = compute_domain[0];
5894  if (av.get_accelerator().get_device_path() == L"cpu") {
5895  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
5896  }
5897  Kalmar::mcw_cxxamp_launch_kernel<Kernel, 1>(av.pQueue, &ext, NULL, f);
5898 #else //if __KALMAR_ACCELERATOR__ != 1
5899  //to ensure functor has right operator() defined
5900  //this triggers the trampoline code being emitted
5901  auto foo = &Kernel::__cxxamp_trampoline;
5902  auto bar = &Kernel::operator();
5903 #endif
5904 }
5905 #pragma clang diagnostic pop
5906 
5907 #pragma clang diagnostic push
5908 #pragma clang diagnostic ignored "-Wunused-variable"
5909 //2D parallel_for_each, nontiled
5910 template <typename Kernel>
5911 __attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av,
5912  extent<2> compute_domain, const Kernel& f) restrict(cpu,amp) {
5913 #if __KALMAR_ACCELERATOR__ != 1
5914  if(compute_domain[0]<=0 || compute_domain[1]<=0) {
5915  throw invalid_compute_domain("Extent is less than or equal to 0.");
5916  }
5917  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
5918  throw invalid_compute_domain("Extent size too large.");
5919 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5920  if (is_cpu()) {
5921  launch_cpu_task(av.pQueue, f, compute_domain);
5922  return;
5923  }
5924 #endif
5925  size_t ext[2] = {static_cast<size_t>(compute_domain[1]),
5926  static_cast<size_t>(compute_domain[0])};
5927  if (av.get_accelerator().get_device_path() == L"cpu") {
5928  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
5929  }
5930  Kalmar::mcw_cxxamp_launch_kernel<Kernel, 2>(av.pQueue, ext, NULL, f);
5931 #else //if __KALMAR_ACCELERATOR__ != 1
5932  //to ensure functor has right operator() defined
5933  //this triggers the trampoline code being emitted
5934  auto foo = &Kernel::__cxxamp_trampoline;
5935  auto bar = &Kernel::operator();
5936 #endif
5937 }
5938 #pragma clang diagnostic pop
5939 
5940 #pragma clang diagnostic push
5941 #pragma clang diagnostic ignored "-Wunused-variable"
5942 //3D parallel_for_each, nontiled
5943 template <typename Kernel>
5944 __attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av,
5945  extent<3> compute_domain, const Kernel& f) restrict(cpu,amp) {
5946 #if __KALMAR_ACCELERATOR__ != 1
5947  if(compute_domain[0]<=0 || compute_domain[1]<=0 || compute_domain[2]<=0) {
5948  throw invalid_compute_domain("Extent is less than or equal to 0.");
5949  }
5950  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
5951  throw invalid_compute_domain("Extent size too large.");
5952  if (static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
5953  throw invalid_compute_domain("Extent size too large.");
5954  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
5955  throw invalid_compute_domain("Extent size too large.");
5956  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
5957  throw invalid_compute_domain("Extent size too large.");
5958 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5959  if (is_cpu()) {
5960  launch_cpu_task(av.pQueue, f, compute_domain);
5961  return;
5962  }
5963 #endif
5964  size_t ext[3] = {static_cast<size_t>(compute_domain[2]),
5965  static_cast<size_t>(compute_domain[1]),
5966  static_cast<size_t>(compute_domain[0])};
5967  if (av.get_accelerator().get_device_path() == L"cpu") {
5968  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
5969  }
5970  Kalmar::mcw_cxxamp_launch_kernel<Kernel, 3>(av.pQueue, ext, NULL, f);
5971 #else //if __KALMAR_ACCELERATOR__ != 1
5972  //to ensure functor has right operator() defined
5973  //this triggers the trampoline code being emitted
5974  auto foo = &Kernel::__cxxamp_trampoline;
5975  auto bar = &Kernel::operator();
5976 #endif
5977 }
5978 #pragma clang diagnostic pop
5979 
5980 #pragma clang diagnostic push
5981 #pragma clang diagnostic ignored "-Wunused-variable"
5982 //1D parallel_for_each, tiled
5983 template <int D0, typename Kernel>
5984 __attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av,
5985  tiled_extent<D0> compute_domain, const Kernel& f) restrict(cpu,amp) {
5986 #if __KALMAR_ACCELERATOR__ != 1
5987  if(compute_domain[0]<=0) {
5988  throw invalid_compute_domain("Extent is less than or equal to 0.");
5989  }
5990  if (static_cast<size_t>(compute_domain[0]) > 4294967295L)
5991  throw invalid_compute_domain("Extent size too large.");
5992  size_t ext = compute_domain[0];
5993  size_t tile = compute_domain.tile_dim0;
5994  static_assert(compute_domain.tile_dim0 <= 1024, "The maximum number of threads in a tile is 1024");
5995  if(ext % tile != 0) {
5996  throw invalid_compute_domain("Extent isn't evenly divisible by tile size.");
5997  }
5998 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
5999  if (is_cpu()) {
6000  launch_cpu_task(av.pQueue, f, compute_domain);
6001  } else
6002 #endif
6003  if (av.get_accelerator().get_device_path() == L"cpu") {
6004  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
6005  }
6006  Kalmar::mcw_cxxamp_launch_kernel<Kernel, 1>(av.pQueue, &ext, &tile, f);
6007 #else //if __KALMAR_ACCELERATOR__ != 1
6008  tiled_index<D0> this_is_used_to_instantiate_the_right_index;
6009  //to ensure functor has right operator() defined
6010  //this triggers the trampoline code being emitted
6011  auto foo = &Kernel::__cxxamp_trampoline;
6012  auto bar = &Kernel::operator();
6013 #endif
6014 }
6015 #pragma clang diagnostic pop
6016 
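A tiled launch against the overload above must keep the tile within 1024 threads and the extent evenly divisible by the tile size, or the checks above fire. A sketch of the classic tile-local reduction (assumes src.size() is a multiple of 256 and partial has one element per tile):

    #include <amp.h>
    #include <vector>

    void tile_sums(const std::vector<float>& src, std::vector<float>& partial) {
        Concurrency::array_view<const float, 1> in((int)src.size(), src);
        Concurrency::array_view<float, 1> out((int)partial.size(), partial);
        out.discard_data();
        Concurrency::parallel_for_each(in.get_extent().tile<256>(),
            [=](Concurrency::tiled_index<256> t) restrict(amp) {
                tile_static float scratch[256];
                scratch[t.local[0]] = in[t.global];
                t.barrier.wait();
                // Tree reduction within the tile.
                for (int stride = 128; stride > 0; stride /= 2) {
                    if (t.local[0] < stride)
                        scratch[t.local[0]] += scratch[t.local[0] + stride];
                    t.barrier.wait();
                }
                if (t.local[0] == 0)
                    out[t.tile[0]] = scratch[0]; // one partial sum per tile
            });
        out.synchronize();
    }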
6017 #pragma clang diagnostic push
6018 #pragma clang diagnostic ignored "-Wunused-variable"
6019 //2D parallel_for_each, tiled
6020 template <int D0, int D1, typename Kernel>
6021 __attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av,
6022  tiled_extent<D0, D1> compute_domain, const Kernel& f) restrict(cpu,amp) {
6023 #if __KALMAR_ACCELERATOR__ != 1
6024  if(compute_domain[0]<=0 || compute_domain[1]<=0) {
6025  throw invalid_compute_domain("Extent is less than or equal to 0.");
6026  }
6027  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
6028  throw invalid_compute_domain("Extent size too large.");
6029  size_t ext[2] = { static_cast<size_t>(compute_domain[1]),
6030  static_cast<size_t>(compute_domain[0])};
6031  size_t tile[2] = { compute_domain.tile_dim1,
6032  compute_domain.tile_dim0};
6033  static_assert(compute_domain.tile_dim1 * compute_domain.tile_dim0 <= 1024, "The maximum number of threads in a tile is 1024");
6034  if((ext[0] % tile[0] != 0) || (ext[1] % tile[1] != 0)) {
6035  throw invalid_compute_domain("Extent isn't evenly divisible by tile size.");
6036  }
6037 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
6038  if (is_cpu()) {
6039  launch_cpu_task(av.pQueue, f, compute_domain);
6040  } else
6041 #endif
6042  if (av.get_accelerator().get_device_path() == L"cpu") {
6043  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
6044  }
6045  Kalmar::mcw_cxxamp_launch_kernel<Kernel, 2>(av.pQueue, ext, tile, f);
6046 #else //if __KALMAR_ACCELERATOR__ != 1
6047  tiled_index<D0, D1> this_is_used_to_instantiate_the_right_index;
6048  //to ensure functor has right operator() defined
6049  //this triggers the trampoline code being emitted
6050  auto foo = &Kernel::__cxxamp_trampoline;
6051  auto bar = &Kernel::operator();
6052 #endif
6053 }
6054 #pragma clang diagnostic pop
6055 
6056 #pragma clang diagnostic push
6057 #pragma clang diagnostic ignored "-Wunused-variable"
6058 //3D parallel_for_each, tiled
6059 template <int D0, int D1, int D2, typename Kernel>
6060 __attribute__((noinline,used)) void parallel_for_each(const accelerator_view& av,
6061  tiled_extent<D0, D1, D2> compute_domain, const Kernel& f) restrict(cpu,amp) {
6062 #if __KALMAR_ACCELERATOR__ != 1
6063  if(compute_domain[0]<=0 || compute_domain[1]<=0 || compute_domain[2]<=0) {
6064  throw invalid_compute_domain("Extent is less than or equal to 0.");
6065  }
6066  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) > 4294967295L)
6067  throw invalid_compute_domain("Extent size too large.");
6068  if (static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
6069  throw invalid_compute_domain("Extent size too large.");
6070  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
6071  throw invalid_compute_domain("Extent size too large.");
6072  if (static_cast<size_t>(compute_domain[0]) * static_cast<size_t>(compute_domain[1]) * static_cast<size_t>(compute_domain[2]) > 4294967295L)
6073  throw invalid_compute_domain("Extent size too large.");
6074  size_t ext[3] = { static_cast<size_t>(compute_domain[2]),
6075  static_cast<size_t>(compute_domain[1]),
6076  static_cast<size_t>(compute_domain[0])};
6077  size_t tile[3] = { compute_domain.tile_dim2,
6078  compute_domain.tile_dim1,
6079  compute_domain.tile_dim0};
6080  static_assert(compute_domain.tile_dim2 * compute_domain.tile_dim1 * compute_domain.tile_dim0 <= 1024, "The maximum number of threads in a tile is 1024");
6081  if((ext[0] % tile[0] != 0) || (ext[1] % tile[1] != 0) || (ext[2] % tile[2] != 0)) {
6082  throw invalid_compute_domain("Extent isn't evenly divisible by tile size.");
6083  }
6084 #if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
6085  if (is_cpu()) {
6086  launch_cpu_task(av.pQueue, f, compute_domain);
6087  } else
6088 #endif
6089  if (av.get_accelerator().get_device_path() == L"cpu") {
6090  throw runtime_exception(Kalmar::__errorMsg_UnsupportedAccelerator, E_FAIL);
6091  }
6092  Kalmar::mcw_cxxamp_launch_kernel<Kernel, 3>(av.pQueue, ext, tile, f);
6093 #else //if __KALMAR_ACCELERATOR__ != 1
6094  tiled_index<D0, D1, D2> this_is_used_to_instantiate_the_right_index;
6095  //to ensure functor has right operator() defined
6096  //this triggers the trampoline code being emitted
6097  auto foo = &Kernel::__cxxamp_trampoline;
6098  auto bar = &Kernel::operator();
6099 #endif
6100 }
6101 #pragma clang diagnostic pop
6102 
6103 } // namespace Concurrency
Definition: amp.h:4690
bool operator==(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: amp.h:204
T & operator()(const index< N > &idx)
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: amp.h:3026
array(const extent< N > &ext)
Constructs a new array with the supplied extent, located on the default view of the default accelerat...
Definition: amp.h:2515
unsigned int atomic_compare_exchange(unsigned int *dest, unsigned int *expected_val, unsigned int val)
These functions attempt to perform these three steps atomically:
tiled_extent< D0 > tile() const
Produces a tiled_extent object with the tile extents given by D0, D1, and D2.
Definition: amp.h:922
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: amp.h:3664
extent & operator%=(int value)
For a given operator , produces the same effect as (*this) = (*this) value.
Definition: amp.h:1051
array(const array_view< const T, N > &src, accelerator_view av, accelerator_view associated_av)
Constructs a staging array initialized with the array_view given by "src", which acts as a staging ar...
Definition: amp.h:2841
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: amp.h:4197
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2)
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: amp.h:3221
array_view & operator=(const array_view &other)
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: amp.h:4177
array_view(const array< T, N > &src)
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: amp.h:4045
array(int e0, int e1, int e2)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:2529
array_view< const T, N > section(const Concurrency::index< N > &idx, const Concurrency::extent< N > &ext) const
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: amp.h:4426
Definition: amp.h:4748
tiled_extent(const tiled_extent &other)
Copy constructor.
Definition: amp.h:1849
array(const Concurrency::extent< N > &ext, InputIter srcBegin, InputIter srcEnd)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: amp.h:2552
array_view(const extent< N > &extent, const Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: amp.h:4065
array(const Concurrency::extent< N > &ext, InputIter srcBegin)
Constructs a new array with the supplied extent, located on the default accelerator, initialized with the contents of a source container specified by a beginning and optional ending iterator.
Definition: amp.h:2549
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:4469
const Concurrency::extent< 1 > tile_extent
Returns an instance of an extent<N> that captures the values of the tiled_index template arguments D0...
Definition: amp.h:1522
array(const Concurrency::extent< N > &ext, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: amp.h:2813
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array to the array_view given by "dest", as if by calling "copy(*this...
Definition: amp.h:2975
tiled_extent()
Default constructor.
Definition: amp.h:1728
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: amp.h:4295
completion_future()
Default constructor.
Definition: amp.h:579
array_view< T, K > view_as(Concurrency::extent< K > viewExtent) const
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: amp.h:3934
Definition: amp.h:4720
T value_type
The element type of this array.
Definition: amp.h:3395
extent(int components[])
Constructs an extent<N> with the coordinate values provided the array of int component values...
Definition: amp.h:856
accelerator_view create_view(queuing_mode qmode=queuing_mode_automatic)
Creates and returns a new accelerator view on the accelerator with the supplied queuing mode...
Definition: amp.h:427
std::future_status wait_for(const std::chrono::duration< _Rep, _Period > &_Rel_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: amp.h:675
accelerator_view get_accelerator_view() const
This property returns the accelerator_view representing the location where this array has been alloca...
Definition: amp.h:2889
extent & operator--()
For a given operator , produces the same effect as (*this) = (*this) 1.
Definition: amp.h:1075
extent & operator+=(const extent &__r)
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: amp.h:971
const T & operator()(int i0, int i1, int i2) const
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: amp.h:3086
void wait_with_global_memory_fence() const
Blocks execution of all threads in the thread tile until all threads in the tile have reached this ca...
Definition: amp.h:1222
array_view(int e0, const value_type *src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: amp.h:4115
Definition: amp.h:3348
const index< 1 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: amp.h:1477
array(int e0)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:2525
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: amp.h:1628
void copy(const array_view< const T, N > &src, const array_view< T, N > &dest)
The contents of "src" are copied into "dest".
Definition: amp.h:4979
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: amp.h:3651
bool get_supports_limited_double_precision() const
Returns a boolean value indicating whether the accelerator has limited double precision support (excl...
Definition: amp.h:526
void copy_to(const array_view< T, N > &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*thi...
Definition: amp.h:4206
const index< 2 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled ext...
Definition: amp.h:1611
bool contains(const index< N > &idx) const
Tests whether the index "idx" is properly contained within this extent (with an assumed origin of zer...
Definition: amp.h:895
array_view< const T, K > view_as(const Concurrency::extent< K > &viewExtent) const
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: amp.h:3303
void copy(const array_view< const T, 1 > &src, const array_view< T, 1 > &dest)
The contents of "src" are copied into "dest".
Definition: amp.h:5042
array_view< ElementType, N > reinterpret_as() const
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: amp.h:3909
array_view< const ElementType, N > reinterpret_as() const
This member function is similar to "array<T,N>::reinterpret_as", although it only supports array_view...
Definition: amp.h:4488
accelerator_view(const accelerator_view &other)
Copy-constructs an accelerator_view object.
Definition: amp.h:82
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: amp.h:4157
array(int e0, int e1, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: amp.h:2574
T & operator()(int i0, int i1, int i2)
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: amp.h:3069
const T & operator[](const index< N > &idx) const
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: amp.h:3043
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: amp.h:2746
const index< 3 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent...
Definition: amp.h:1342
array_projection_helper< T, N >::result_type operator()(int i0)
This overload is defined for array<T,N> where .
Definition: amp.h:3114
bool get_has_display() const
This property indicates that the accelerator may be shared by (and thus have interference from) the o...
Definition: amp.h:504
array(const array_view< const T, N > &src, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array initialized with the contents of the array_view "src".
Definition: amp.h:2720
array(int e0, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: amp.h:2643
array_view(const extent< N > &ext, const value_type *src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: amp.h:4079
std::wstring get_description() const
Returns a short textual description of the accelerator device.
Definition: amp.h:486
bool set_default_cpu_access_type(access_type default_cpu_access_type)
Sets the default_cpu_access_type for this accelerator.
Definition: amp.h:471
extent< N > operator/(const extent< N > &ext, int value)
Binary arithmetic operations that produce a new extent<N> that is the result of performing the corres...
Definition: amp.h:4651
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: amp.h:2872
int value_type
The element type of extent<N>.
Definition: amp.h:796
array(int e0, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: amp.h:2568
extent operator++(int)
For a given operator , produces the same effect as (*this) = (*this) 1.
Definition: amp.h:1070
bool operator!=(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent differe...
Definition: amp.h:451
std::future_status wait_until(const std::chrono::time_point< _Clock, _Duration > &_Abs_time) const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: amp.h:680
void synchronize_to(const accelerator_view &av) const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: amp.h:3703
Represents a logical (isolated) accelerator view of a compute accelerator.
Definition: amp.h:73
tiled_extent()
Default constructor.
Definition: amp.h:1948
void synchronize() const
Calling this member function synchronizes any modifications made to the data underlying "this" array_...
Definition: amp.h:4267
STL namespace.
array(int e0, int e1, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: amp.h:2645
completion_future & operator=(const completion_future &other)
Copy assignment.
Definition: amp.h:611
bool operator!=(const extent &other) const
Compares two objects of extent<N>.
Definition: amp.h:957
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:3885
C++ AMP namespace.
Definition: amp.h:25
extent< N > get_extent() const
Access the extent that defines the shape of this array_view.
Definition: amp.h:3529
This class is the return type of all C++ AMP asynchronous APIs and has an interface analogous to std:...
Definition: amp.h:571
extent(_Tp...__t)
Constructs an extent<N> with the coordinate values provided by .
Definition: amp.h:828
std::wstring get_device_path() const
Returns a system-wide unique device instance path that matches the "Device Instance Path" property fo...
Definition: amp.h:481
Definition: kalmar_exception.h:51
extent & operator-=(const index< N > &idx)
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: amp.h:1019
tiled_extent()
Default constructor.
Definition: amp.h:1840
void global_memory_fence(const tile_barrier &)
Establishes a thread-tile scoped memory fence for global (but not tile-static) memory operations...
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: amp.h:2875
extent & operator*=(const extent &__r)
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: amp.h:981
array_view< const T, K > view_as(Concurrency::extent< K > viewExtent) const
This member function is similar to "array<T,N>::view_as", although it only supports array_views of ra...
Definition: amp.h:4511
T & operator[](const index< N > &idx)
Returns a reference to the element of this array that is at the location in N-dimensional space speci...
Definition: amp.h:3017
array_view & operator=(const array_view &other)
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: amp.h:3550
array(const array &other)
Copy constructor.
Definition: amp.h:2493
array(const Concurrency::extent< N > &ext, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator vi...
Definition: amp.h:2769
Definition: amp.h:4786
array_view(int e0, value_type *src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: amp.h:3494
tiled_extent truncate() const
Returns a new tiled_extent with the extents adjusted down to be evenly divisible by the tile dimensio...
Definition: amp.h:1886
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: amp.h:2577
tiled_extent(const extent< 3 > &ext)
Constructs a tiled_extent<N> with the extent "ext".
Definition: amp.h:1746
extent< N > operator+(const extent< N > &lhs, const extent< N > &rhs)
Adds (or subtracts) two objects of extent<N> to form a new extent.
Definition: amp.h:4584
unsigned int atomic_exchange(unsigned int *dest, unsigned int val)
Atomically read the value stored in dest , replace it with the value given in val and return the old ...
array(int e0, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: amp.h:2571
namespace for internal classes of Kalmar compiler / runtime
Definition: hc.hpp:42
array_view< ElementType, 1 > reinterpret_as()
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with a (unsafe) reinterpretation of the element type.
Definition: amp.h:3254
array_view< T, N > section(const extent< N > &ext)
Equivalent to "section(index<N>(), ext)".
Definition: amp.h:3184
Represents a set of related indices subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: amp.h:1301
void wait()
Performs a blocking wait for completion of all commands submitted to the accelerator view prior to ca...
Definition: amp.h:151
unsigned atomic_fetch_sub(unsigned *x, unsigned y)
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
tiled_index(const tiled_index< D0, D1, D2 > &o)
Copy constructor.
Definition: amp.h:1323
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: amp.h:2752
extent & operator-=(int value)
For a given operator , produces the same effect as (*this) = (*this) value.
Definition: amp.h:1039
unsigned int size() const
This member function returns the total linear size of this extent<N> (in units of elements)...
Definition: amp.h:904
array_view< T, N > section(const index< N > &idx)
Equivalent to "section(idx, this->extent – idx)".
Definition: amp.h:3165
array(int e0, int e1, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: amp.h:2866
accelerator(const accelerator &other)
Copy constructs an accelerator object.
Definition: amp.h:344
The tile_barrier class is a capability class that is only creatable by the system, and passed to a tiled parallel_for_each function object as part of the tiled_index parameter.
Definition: amp.h:1152
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside t...
Definition: amp.h:4236
array_view< const T, N > section(const Concurrency::extent< N > &ext) const
Equivalent to "section(index<N>(), ext)".
Definition: amp.h:4444
array(int e0, int e1, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: amp.h:2869
const Concurrency::extent< 2 > tile_extent
Returns an instance of an extent<N> that captures the values of the tiled_index template arguments D0...
Definition: amp.h:1656
tiled_extent< D0, D1 > tile() const
Produces a tiled_extent object with the tile extents given by D0, D1, and D2.
Definition: amp.h:928
access_type get_default_cpu_access_type() const
Get the default cpu access_type for buffers created on this accelerator.
Definition: amp.h:550
extent & operator/=(const extent &__r)
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: amp.h:986
static std::vector< accelerator > get_all()
Returns a std::vector of accelerator objects (in no specific order) representing all accelerators tha...
Definition: amp.h:353
const T & operator()(int i0, int i1) const
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]])) const".
Definition: amp.h:3083
tiled_extent(const extent< 2 > &ext)
Constructs a tiled_extent<N> with the extent "ext".
Definition: amp.h:1858
array_view< T, 2 > section(int i0, int i1, int e0, int e1) const
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: amp.h:3213
accelerator_view get_default_view() const
Returns the default accelerator_view associated with the accelerator.
Definition: amp.h:417
Definition: kalmar_exception.h:22
bool get_is_emulated() const
Returns a boolean value indicating whether the accelerator is emulated.
Definition: amp.h:539
extent & operator+=(int value)
For a given operator , produces the same effect as (*this) = (*this) value.
Definition: amp.h:1035
array_projection_helper< T, N >::const_result_type operator()(int i0) const
This overload is defined for array<T,N> where .
Definition: amp.h:3122
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator supports debugging.
Definition: amp.h:533
extent & operator*=(int value)
For a given operator , produces the same effect as (*this) = (*this) value.
Definition: amp.h:1043
accelerator()
Constructs a new accelerator object that represents the default accelerator.
Definition: amp.h:317
static accelerator_view get_auto_selection_view()
Returns an accelerator_view which when passed as the first argument to a parallel_for_each call cause...
Definition: amp.h:393
T * data() const
Returns a pointer to the raw data underlying this array.
Definition: amp.h:2984
tiled_extent pad() const
Returns a new tiled_extent with the extents adjusted up to be evenly divisible by the tile dimensions...
Definition: amp.h:1874
Definition: amp.h:5789
array_view< T, N > section(const Concurrency::extent< N > &ext) const
Equivalent to "section(index<N>(), ext)".
Definition: amp.h:3865
array_view(int e0)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:3509
Definition: amp.h:2217
Definition: kalmar_exception.h:42
array(int e0, int e1, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: amp.h:2743
const index< 2 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: amp.h:1605
array(int e0, int e1, int e2, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: amp.h:2792
extent(const extent &other)
Copy constructor.
Definition: amp.h:812
array_view< T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:3890
unsigned atomic_fetch_add(unsigned *x, unsigned y)
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
completion_future & operator=(completion_future &&other)
Move assignment.
Definition: amp.h:628
void then(const functor &func)
This method enables specification of a completion callback func which is executed upon completion of ...
Definition: amp.h:706
T * data() const
Returns a pointer to the first data element underlying this array_view.
Definition: amp.h:3602
int atomic_fetch_max(int *dest, int val)
Atomically read the value stored in dest, apply the binary numerical operation specific to the functi...
void wait() const
These methods are functionally identical to the corresponding std::shared_future<void> methods...
Definition: amp.h:669
extent< N > operator*(const extent< N > &ext, int value)
Binary arithmetic operations that produce a new extent<N> that is the result of performing the corres...
Definition: amp.h:4639
const Concurrency::extent< 3 > tile_extent
Returns an instance of an extent<N> that captures the values of the tiled_index template arguments D0...
Definition: amp.h:1381
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: amp.h:1353
const index< 1 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: amp.h:1471
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: amp.h:2740
extent< N > operator-(const extent< N > &lhs, const extent< N > &rhs)
Adds (or subtracts) two objects of extent<N> to form a new extent.
Definition: amp.h:4590
array_view< T, N > section(const Concurrency::index< N > &idx, const Concurrency::extent< N > &ext) const
Returns a subsection of the source array view at the origin specified by "idx" and with the extent sp...
Definition: amp.h:3843
extent< N > get_extent() const
Access the extent that defines the shape of this array_view.
Definition: amp.h:4146
bool operator==(const extent &other) const
Compares two objects of extent<N>.
Definition: amp.h:954
tiled_extent(const tiled_extent &other)
Copy constructor.
Definition: amp.h:1957
array_view< const T, N > section(const extent< N > &ext) const
Equivalent to "section(index<N>(), ext)".
Definition: amp.h:3188
accelerator & operator=(const accelerator &other)
Assigns an accelerator object to "this" accelerator object and returns a reference to "this" object...
Definition: amp.h:406
void copy_to(array< T, N > &dest) const
Copies the data referred to by this array_view to the array given by "dest", as if by calling "copy(*...
Definition: amp.h:3568
extent & operator++()
For a given operator , produces the same effect as (*this) = (*this) 1.
Definition: amp.h:1066
array_view< T, K > view_as(const Concurrency::extent< K > &viewExtent)
An array of higher rank can be reshaped into an array of lower rank, or vice versa, using the view_as member function.
Definition: amp.h:3294
bool get_supports_double_precision() const
Returns a Boolean value indicating whether this accelerator supports double-precision (double) comput...
Definition: amp.h:518
Concurrency::extent< 3 > get_tile_extent() const
Returns an instance of an extent<N> that captures the values of the tiled_index template arguments D0...
Definition: amp.h:1378
array_view< const T, N > section(const Concurrency::index< N > &idx) const
Equivalent to "section(idx, this->extent – idx)".
Definition: amp.h:4435
tiled_extent truncate() const
Returns a new tiled_extent with the extents adjusted down to be evenly divisible by the tile dimensio...
Definition: amp.h:1775
array_view(const array_view &other)
Copy constructor.
Definition: amp.h:3523
completion_future create_marker()
This command inserts a marker event into the accelerator_view&#39;s command queue.
Definition: amp.h:768
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: amp.h:3475
array_view(const Concurrency::extent< N > &ext, value_type *src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: amp.h:3444
array(const Concurrency::extent< N > &ext, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Constructs a staging array with the given extent, which acts as a staging area between accelerator_vi...
Definition: amp.h:2816
extent(const int components[])
Constructs an extent<N> with the coordinate values provided the array of int component values...
Definition: amp.h:844
completion_future synchronize_async() const
An asynchronous version of synchronize, which returns a completion future object. ...
Definition: amp.h:4279
array_view & operator=(const array_view< T, N > &other)
Assigns the contents of the array_view "other" to this array_view, using a shallow copy...
Definition: amp.h:4168
array(int e0, int e1, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: amp.h:2790
The partial specialization array_view<const T,N> represents a view over elements of type const T with...
Definition: amp.h:4011
Represents a unique position in N-dimensional space.
Definition: amp.h:31
array(int e0, int e1, int e2, InputIter srcBegin, InputIter srcEnd)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: amp.h:2583
Represents an N-dimensional region of memory (with type T) located on an accelerator.
Definition: amp.h:30
const T & operator()(const index< N > &idx) const
Returns a const reference to the element of this array that is at the location in N-dimensional space...
Definition: amp.h:3052
array_view< T, 2 > section(int i0, int i1, int e0, int e1)
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: amp.h:3217
const index< 1 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile of a tiled extent.
Definition: amp.h:1489
array_view(const array_view &other)
Copy constructor.
Definition: amp.h:4130
array(const extent< N > &ext, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_view "av".
Definition: amp.h:2625
tiled_index(const tiled_index< D0, D1 > &o)
Copy constructor.
Definition: amp.h:1598
const index< 3 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile of a tiled extent.
Definition: amp.h:1348
accelerator_view get_associated_accelerator_view() const
This property returns the accelerator_view representing the preferred target where this array can be copied.
Definition: amp.h:2895
array_view< const T, 1 > section(int i0, int e0) const
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:4459
array(const array_view< const T, N > &src)
Constructs a new array, located on the default view of the default accelerator, initialized with the contents of "src".
Definition: amp.h:2599
tiled_extent< D0, D1, D2 > tile() const
Produces a tiled_extent object with the tile extents given by D0, D1, and D2.
Definition: amp.h:934
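For example (a sketch; the 64x48 extent is arbitrary), tile() subdivides an extent while preserving its overall shape, and the tile dimensions remain available as compile-time constants:

    #include <amp.h>
    using namespace concurrency;

    int main() {
        extent<2> e(64, 48);
        tiled_extent<8, 8> te = e.tile<8, 8>();  // same 64x48 extent, 8x8 tiles
        static_assert(tiled_extent<8, 8>::tile_dim0 == 8, "tile dims are static");
        return 0;
    }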
array_view< T, 1 > section(int i0, int e0) const
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:3880
array(int e0, int e1, int e2, InputIter srcBegin)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src)".
Definition: amp.h:2580
array(array &&other)
Move constructor.
Definition: amp.h:2504
array(int e0, int e1, int e2, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, cpu_access_type)".
Definition: amp.h:2647
Represents a unique position in N-dimensional space.
Definition: kalmar_index.h:226
static const int tile_dim2
These constants allow access to the template arguments of tiled_extent.
Definition: amp.h:1804
bool get_is_auto_selection()
Returns a boolean value indicating whether the accelerator view when passed to a parallel_for_each would result in automatic selection of an appropriate target accelerator.
Definition: amp.h:119
tile_barrier(const tile_barrier &other)
Copy constructor.
Definition: amp.h:1175
Concurrency::extent< 1 > get_tile_extent() const
Returns an instance of an extent<N> that captures the values of the tiled_index template arguments D0, D1, and D2.
Definition: amp.h:1519
extent & operator/=(int value)
For a given operator @, produces the same effect as (*this) = (*this) @ value.
Definition: amp.h:1047
extent< N > operator%(const extent< N > &ext, int value)
Binary arithmetic operations that produce a new extent<N> that is the result of performing the corresponding binary arithmetic operation on each component.
Definition: amp.h:4663
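These operators act componentwise; a small sketch (the values in the comments follow directly from the definitions):

    #include <amp.h>
    using namespace concurrency;

    int main() {
        extent<2> e(6, 9);
        extent<2> m = e % 4;   // (6 % 4, 9 % 4) == (2, 1)
        e += index<2>(1, 1);   // e is now (7, 10)
        e /= 2;                // e is now (3, 5)
        return 0;
    }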
array_view< T, N > section(const Concurrency::index< N > &origin, const Concurrency::extent< N > &ext)
Returns a subsection of the source array view at the origin specified by "idx" and with the extent specified by "ext".
Definition: amp.h:3146
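A sketch of section() (names are illustrative): writes through the subsection land in the corresponding elements of the source view.

    #include <amp.h>
    #include <vector>
    using namespace concurrency;

    int main() {
        std::vector<int> v(16, 0);
        array_view<int, 2> m(4, 4, v);
        // 2x2 window whose top-left corner sits at (1, 1) in m:
        array_view<int, 2> window = m.section(index<2>(1, 1), extent<2>(2, 2));
        parallel_for_each(window.get_extent(), [=](index<2> idx) restrict(amp) {
            window[idx] = 1;
        });
        window.synchronize(); // the four interior cells of v are now 1
        return 0;
    }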
array(const Concurrency::extent< N > &ext, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Constructs a new array with the supplied extent, located on the accelerator bound to the accelerator_view "av", initialized with the contents beginning at "srcBegin".
Definition: amp.h:2680
Represents an extent subdivided into 1-, 2-, or 3-dimensional tiles.
Definition: amp.h:32
void all_memory_fence(const tile_barrier &)
Establishes a thread-tile scoped memory fence for both global and tile-static memory operations.
array_view< const T, N > section(const Concurrency::index< N > &origin, const Concurrency::extent< N > &ext) const
Returns a subsection of the source array view at the origin specified by "idx" and with the extent specified by "ext".
Definition: amp.h:3154
unsigned int get_version() const
Returns a 32-bit unsigned integer representing the version number of this accelerator.
Definition: amp.h:494
static bool set_default(const std::wstring &path)
Sets the default accelerator to the device path identified by the "path" argument.
Definition: amp.h:374
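A sketch of set_default (it only succeeds if called before the default accelerator is first used; accelerator::cpu_accelerator is the standard C++ AMP device path for the CPU fallback and is assumed to be available here):

    #include <amp.h>
    #include <iostream>
    using namespace concurrency;

    int main() {
        bool changed = accelerator::set_default(accelerator::cpu_accelerator);
        std::cout << (changed ? "default updated" : "default already in use") << "\n";
        return 0;
    }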
void copy_to(const array_view &dest) const
Copies the contents of this array_view to the array_view given by "dest", as if by calling "copy(*this, dest)".
Definition: amp.h:3585
tiled_extent(const extent< 1 > &ext)
Constructs a tiled_extent<N> with the extent "ext".
Definition: amp.h:1966
const index< 2 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent.
Definition: amp.h:1617
void wait_with_tile_static_memory_fence() const
Blocks execution of all threads in the thread tile until all threads in the tile have reached this call.
Definition: amp.h:1240
bool operator!=(const accelerator_view &other) const
Compares "this" accelerator_view with the passed accelerator_view object to determine if they represe...
Definition: amp.h:216
array_view< T, 1 > section(int i0, int e0)
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: amp.h:3205
Concurrency::extent< 2 > get_tile_extent() const
Returns an instance of an extent<N> that captures the values of the tiled_index template arguments D0, D1, and D2.
Definition: amp.h:1653
T & operator()(int i0, int i1)
Equivalent to "array<T,N>::operator()(index<N>(i0 [, i1 [, i2 ]]))".
Definition: amp.h:3066
tiled_index(const tiled_index< D0 > &o)
Copy constructor.
Definition: amp.h:1464
tiled_extent pad() const
Returns a new tiled_extent with the extents adjusted up to be evenly divisible by the tile dimensions.
Definition: amp.h:1982
completion_future(completion_future &&other)
Move constructor.
Definition: amp.h:600
extent & operator-=(const extent &__r)
Adds (or subtracts) an object of type extent<N> from this extent to form a new extent.
Definition: amp.h:975
accelerator get_accelerator() const
Returns the accelerator that this accelerator_view has been created on.
Definition: amp.h:766
extent & operator%=(const extent &__r)
For a given operator @, produces the same effect as (*this) = (*this) @ __r.
Definition: amp.h:991
queuing_mode get_queuing_mode() const
Returns the queuing mode that this accelerator_view was created with.
Definition: amp.h:105
static const int tile_dim0
These constants allow access to the template arguments of tiled_extent.
Definition: amp.h:1802
Definition: amp.h:3337
completion_future copy_async(const array< T, N > &src, array< T, N > &dest)
The contents of "src" are copied into "dest".
Definition: amp.h:5178
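A sketch of the asynchronous copy (names are illustrative): copy_async enqueues the transfer and returns immediately.

    #include <amp.h>
    #include <vector>
    using namespace concurrency;

    int main() {
        std::vector<int> init(1024, 7);
        array<int, 1> src(1024, init.begin());
        array<int, 1> dest(1024);
        completion_future cf = copy_async(src, dest); // enqueue, don't block
        cf.wait();                                    // dest now mirrors src
        return 0;
    }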
const index< 1 > tile
An index of rank 1, 2, or 3 that represents the coordinates of the current tile of a tiled extent.
Definition: amp.h:1483
bool valid() const
This method is functionally identical to std::shared_future<void>::valid.
Definition: amp.h:652
void refresh() const
Calling this member function informs the array_view that its bound memory has been modified outside the array_view interface.
Definition: amp.h:3615
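A sketch of when refresh() is needed (names are illustrative): the host writes to the bound memory directly, bypassing the array_view.

    #include <amp.h>
    #include <vector>
    using namespace concurrency;

    int main() {
        std::vector<int> v(8, 0);
        array_view<int, 1> av(8, v);
        v[3] = 42;    // modified behind the array_view's back
        av.refresh(); // drop cached copies so the kernel sees the new value
        parallel_for_each(av.get_extent(), [=](index<1> i) restrict(amp) {
            av[i] += 1;
        });
        av.synchronize();
        return 0;
    }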
extent operator-(const index< N > &idx)
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: amp.h:1010
unsigned int get_version() const
Returns a 32-bit unsigned integer representing the version number of this accelerator view.
Definition: amp.h:770
const index< 3 > global
An index of rank 1, 2, or 3 that represents the global index within an extent.
Definition: amp.h:1330
unsigned atomic_fetch_and(unsigned *x, unsigned y)
Atomically read the value stored in dest, apply the binary numerical operation specific to the function, store the result back, and return the original value.
void tile_static_memory_fence(const tile_barrier &)
Establishes a thread-tile scoped memory fence for tile-static (but not global) memory operations.
array(int e0, int e1, int e2, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: amp.h:2749
extent(int e0)
Constructs an extent<N> with the coordinate values provided by "e0".
Definition: amp.h:824
tiled_extent(const tiled_extent &other)
Copy constructor.
Definition: amp.h:1737
array_view(const Concurrency::extent< N > &extent, Container &src)
Constructs an array_view which is bound to the data contained in the "src" container.
Definition: amp.h:3430
void wait_with_all_memory_fence() const
Blocks execution of all threads in the thread tile until all threads in the tile have reached this call.
Definition: amp.h:1205
array_view< const T, 2 > section(int i0, int i1, int e0, int e1) const
Equivalent to "section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:4464
unsigned atomic_fetch_or(unsigned *x, unsigned y)
Atomically read the value stored in dest, apply the binary numerical operation specific to the function, store the result back, and return the original value.
extent()
Default constructor.
Definition: amp.h:802
array(int e0, InputIter srcBegin, InputIter srcEnd, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: amp.h:2863
Represents a physical accelerated computing device.
Definition: amp.h:282
const T * data() const
Returns a pointer to the first data element underlying this array_view.
Definition: amp.h:4223
tiled_extent truncate() const
Returns a new tiled_extent with the extents adjusted down to be evenly divisible by the tile dimensions.
Definition: amp.h:1993
int atomic_fetch_inc(int *_Dest)
Atomically increment or decrement the value stored at the location pointed to by dest.
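A sketch of a contended counter (names are illustrative): atomic_fetch_inc keeps concurrent increments from different threads consistent.

    #include <amp.h>
    #include <vector>
    using namespace concurrency;

    int main() {
        std::vector<int> v(1, 0);
        array_view<int, 1> counter(1, v);
        parallel_for_each(extent<1>(1000), [=](index<1>) restrict(amp) {
            atomic_fetch_inc(&counter[0]); // 1000 racing increments, no lost updates
        });
        counter.synchronize(); // v[0] == 1000
        return 0;
    }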
array(int e0, int e1)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]))".
Definition: amp.h:2527
completion_future(const completion_future &other)
Copy constructor.
Definition: amp.h:588
array_view(const Concurrency::extent< N > &ext)
Constructs an array_view which is not bound to a data source.
Definition: amp.h:3461
array & operator=(const array &other)
Assigns the contents of the array "other" to this array, using a deep copy.
Definition: amp.h:2910
accelerator_view get_source_accelerator_view() const
Access the accelerator_view where the data source of the array_view is located.
Definition: amp.h:3540
array & operator=(const array_view< T, N > &src)
Assigns the contents of the array_view "src", as if by calling "copy(src, *this)".
Definition: amp.h:2942
array_view< const T, 1 > section(int i0, int e0) const
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: amp.h:3209
Definition: amp.h:5810
array(int e0, InputIter srcBegin, accelerator_view av, access_type cpu_access_type=access_type_auto)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), srcBegin [, srcEnd]...
Definition: amp.h:2737
bool operator==(const accelerator &other) const
Compares "this" accelerator with the passed accelerator object to determine if they represent the sam...
Definition: amp.h:441
accelerator_view & operator=(const accelerator_view &other)
Assigns an accelerator_view object to "this" accelerator_view object and returns a reference to "this" object.
Definition: amp.h:94
The array_view<T,N> type represents a possibly cached view into the data held in an array<T,N>.
Definition: amp.h:29
extent operator--(int)
For a given operator @, produces the same effect as (*this) = (*this) @ 1.
Definition: amp.h:1079
const index< 3 > local
An index of rank 1, 2, or 3 that represents the relative index within the current tile of a tiled extent.
Definition: amp.h:1336
access_type get_cpu_access_type() const
This property returns the CPU "access_type" allowed for this array.
Definition: amp.h:2900
void wait() const
Blocks execution of all threads in the thread tile until all threads in the tile have reached this call.
Definition: amp.h:1188
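A sketch of the barrier in use (names are illustrative): each tile stages data in tile_static memory, waits at the barrier, then reads what its neighbours wrote.

    #include <amp.h>
    #include <vector>
    using namespace concurrency;

    int main() {
        std::vector<int> v(128);
        for (int i = 0; i < 128; ++i) v[i] = i;
        array_view<int, 1> data(128, v);
        parallel_for_each(data.get_extent().tile<64>(),
                          [=](tiled_index<64> t_idx) restrict(amp) {
            tile_static int tmp[64];
            tmp[t_idx.local[0]] = data[t_idx.global];
            t_idx.barrier.wait();                          // whole tile reaches this point
            data[t_idx.global] = tmp[63 - t_idx.local[0]]; // reverse within each tile
        });
        data.synchronize();
        return 0;
    }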
array_view< T, N > section(const Concurrency::index< N > &idx) const
Equivalent to "section(idx, this->extent – idx)".
Definition: amp.h:3856
void discard_data() const
Indicates to the runtime that it may discard the current logical contents of this array_view.
Definition: amp.h:3734
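A sketch of discard_data() on an output-only view (names are illustrative): since every element is overwritten, the initial host-to-device copy can be skipped.

    #include <amp.h>
    #include <vector>
    using namespace concurrency;

    int main() {
        const int n = 256;
        std::vector<float> out(n);
        array_view<float, 1> av(n, out);
        av.discard_data(); // current contents won't be copied to the accelerator
        parallel_for_each(av.get_extent(), [=](index<1> i) restrict(amp) {
            av[i] = 2.0f * i[0];
        });
        av.synchronize();  // copy results back into "out"
        return 0;
    }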
array_view< const ElementType, 1 > reinterpret_as() const
Sometimes it is desirable to view the data of an N-dimensional array as a linear array, possibly with a (unsafe) reinterpretation of the element type.
Definition: amp.h:3267
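A sketch of reinterpret_as (names are illustrative): the same storage, flattened to rank 1 and optionally retyped to an element type of equal size.

    #include <amp.h>
    using namespace concurrency;

    int main() {
        array<float, 2> a(4, 4);
        array_view<float, 1> flat = a.reinterpret_as<float>();       // 16 floats, rank 1
        array_view<unsigned, 1> bits = a.reinterpret_as<unsigned>(); // same bytes as unsigned
        return 0;
    }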
void copy_to(array &dest) const
Copies the contents of this array to the array given by "dest", as if by calling "copy(*this, dest)".
Definition: amp.h:2956
tiled_extent pad() const
Returns a new tiled_extent with the extents adjusted up to be evenly divisible by the tile dimensions.
Definition: amp.h:1762
T value_type
The element type of this array.
Definition: amp.h:2478
unsigned atomic_fetch_xor(unsigned *x, unsigned y)
Atomically read the value stored in dest, apply the binary numerical operation specific to the function, store the result back, and return the original value.
array_view(array< T, N > &src)
Constructs an array_view which is bound to the data contained in the "src" array. ...
Definition: amp.h:3410
int atomic_fetch_dec(int *_Dest)
Atomically increment or decrement the value stored at the location pointed to by dest.
const tile_barrier barrier
An object which represents a barrier within the current tile of threads.
Definition: amp.h:1494
array_view< const T, 3 > section(int i0, int i1, int i2, int e0, int e1, int e2) const
Equivalent to "array<T,N>::section(index<N>(i0 [, i1 [, i2 ]]), extent<N>(e0 [, e1 [...
Definition: amp.h:3225
accelerator(const std::wstring &path)
Constructs a new accelerator object that represents the physical device named by the "path" argument.
Definition: amp.h:334
extent operator+(const index< N > &idx)
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: amp.h:1005
array(int e0, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), av, associated_av)".
Definition: amp.h:2788
const T value_type
The element type of this array_view.
Definition: amp.h:4030
array_view(int e0, Container &src)
Equivalent to construction using "array_view(extent<N>(e0 [, e1 [, e2 ]]), src)". ...
Definition: amp.h:4097
array(int e0, InputIter srcBegin, accelerator_view av, accelerator_view associated_av)
Equivalent to construction using "array(extent<N>(e0 [, e1 [, e2 ]]), src, av, associated_av)".
Definition: amp.h:2860
const index< 2 > tile_origin
An index of rank 1, 2, or 3 that represents the global coordinates of the origin of the current tile of a tiled extent.
Definition: amp.h:1623
void flush()
Sends the queued up commands in the accelerator_view to the device for execution.
Definition: amp.h:182
array_view< const T, N > section(const index< N > &idx) const
Equivalent to "section(idx, this->extent – idx)".
Definition: amp.h:3173
bool get_supports_cpu_shared_memory() const
Returns a boolean value indicating whether the accelerator supports memory accessible both by the accelerator and by the CPU.
Definition: amp.h:545
size_t get_dedicated_memory() const
Returns the amount of dedicated memory (in KB) on an accelerator device.
Definition: amp.h:511
bool get_is_debug() const
Returns a boolean value indicating whether the accelerator_view supports debugging through extensive error reporting.
Definition: amp.h:145
Definition: amp.h:3359
int atomic_fetch_min(int *dest, int val)
Atomically read the value stored in dest, apply the binary numerical operation specific to the function, store the result back, and return the original value.
static const int tile_dim1
These constants allow access to the template arguments of tiled_extent.
Definition: amp.h:1803
extent & operator=(const extent &other)
Assigns the component values of "other" to this extent<N> object.
Definition: amp.h:866
array_view(const array_view< nc_T, N > &other)
Copy constructor.
Definition: amp.h:4140
Definition: kalmar_runtime.h:14
array & operator=(array &&other)
Moves the contents of the array "other" to this array.
Definition: amp.h:2925
Concurrency::extent< N > get_extent() const
Access the extent that defines the shape of this array.
Definition: amp.h:2883
extent & operator+=(const index< N > &idx)
Adds (or subtracts) an object of type index<N> from this extent to form a new extent.
Definition: amp.h:1015