HCC
HCC is a single-source C/C++ compiler for heterogeneous computing, optimized for HSA (http://www.hsafoundation.com/).
kalmar_runtime.h
#pragma once

#include "hc_defines.h"
#include "kalmar_aligned_alloc.h"

// Standard library facilities used in this header; in the HCC tree most of
// these are expected to be available indirectly via hc_defines.h as well.
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <future>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <tuple>
#include <vector>

namespace hc {
class AmPointerInfo;
class completion_future;
} // end namespace hc

typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;

namespace Kalmar {
namespace enums {

enum access_type
{
    access_type_none = 0,
    access_type_read = (1 << 0),
    access_type_write = (1 << 1),
    access_type_read_write = access_type_read | access_type_write,
    access_type_auto = (1 << 31)
};

enum queuing_mode
{
    queuing_mode_immediate,
    queuing_mode_automatic
};

enum execute_order
{
    execute_in_order,
    execute_any_order
};


// Flags to specify visibility of previous commands after a marker is executed.
enum memory_scope
{
    no_scope = 0,          // No release operation applied
    accelerator_scope = 1, // Release to current accelerator
    system_scope = 2,      // Release to system (CPU + all accelerators)
};

static inline memory_scope greater_scope(memory_scope scope1, memory_scope scope2)
{
    if ((scope1 == system_scope) || (scope2 == system_scope)) {
        return system_scope;
    } else if ((scope1 == accelerator_scope) || (scope2 == accelerator_scope)) {
        return accelerator_scope;
    } else {
        return no_scope;
    }
}
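
greater_scope() picks the wider of two release scopes, which is how the runtime can fold several pending operations into a single marker. A minimal sketch of that use, assuming the header is included as kalmar_runtime.h (the function below is illustrative, not part of the header):

// Sketch (illustrative, not part of kalmar_runtime.h): fold the scopes
// requested by a batch of queued operations into the one scope a marker
// packet would need to release.
#include "kalmar_runtime.h"

Kalmar::enums::memory_scope combined_release_scope() {
    using namespace Kalmar::enums;
    memory_scope needed = no_scope;
    needed = greater_scope(needed, accelerator_scope); // -> accelerator_scope
    needed = greater_scope(needed, system_scope);      // -> system_scope (widest wins)
    return needed;
}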


enum hcCommandKind {
    hcCommandInvalid = -1,

    hcMemcpyHostToHost = 0,
    hcMemcpyHostToDevice = 1,
    hcMemcpyDeviceToHost = 2,
    hcMemcpyDeviceToDevice = 3,
    hcCommandKernel = 4,
    hcCommandMarker = 5,
};


// Commands sent to copy queues:
static inline bool isCopyCommand(hcCommandKind k)
{
    switch (k) {
        case hcMemcpyHostToHost:
        case hcMemcpyHostToDevice:
        case hcMemcpyDeviceToHost:
        case hcMemcpyDeviceToDevice:
            return true;
        default:
            return false;
    }
}


// Commands sent to compute queue:
static inline bool isComputeQueueCommand(hcCommandKind k) {
    return (k == hcCommandKernel) || (k == hcCommandMarker);
}
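
Together these predicates let the runtime decide whether a command belongs on a copy engine or on the compute queue. A small illustrative dispatcher sketch (not part of the header):

// Sketch (illustrative): route a command to the engine that executes it.
#include "kalmar_runtime.h"

const char* route(Kalmar::enums::hcCommandKind k) {
    using namespace Kalmar::enums;
    if (isCopyCommand(k))
        return "copy engine";     // any of the four hcMemcpy* kinds
    if (isComputeQueueCommand(k))
        return "compute queue";   // hcCommandKernel or hcCommandMarker
    return "invalid";             // e.g. hcCommandInvalid
}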


enum hcWaitMode {
    hcWaitModeBlocked = 0,
    hcWaitModeActive = 1
};

enum hcAgentProfile {
    hcAgentProfileNone = 0,
    hcAgentProfileBase = 1,
    hcAgentProfileFull = 2
};

} // namespace enums
} // namespace Kalmar


namespace Kalmar {

using namespace Kalmar::enums;

class KalmarDevice;
class KalmarQueue;
struct rw_info;

class KalmarAsyncOp {
public:
    KalmarAsyncOp(KalmarQueue *xqueue, hcCommandKind xCommandKind) : queue(xqueue), commandKind(xCommandKind), seqNum(0) {}

    virtual ~KalmarAsyncOp() {}
    virtual std::shared_future<void>* getFuture() { return nullptr; }
    virtual void* getNativeHandle() { return nullptr; }

    virtual uint64_t getBeginTimestamp() { return 0L; }

    virtual uint64_t getEndTimestamp() { return 0L; }

    virtual uint64_t getTimestampFrequency() { return 0L; }

    virtual bool isReady() { return false; }

    virtual void setWaitMode(hcWaitMode mode) {}

    void setSeqNumFromQueue();
    uint64_t getSeqNum() const { return seqNum; }

    hcCommandKind getCommandKind() const { return commandKind; }
    void setCommandKind(hcCommandKind xCommandKind) { commandKind = xCommandKind; }

    KalmarQueue *getQueue() const { return queue; }

private:
    KalmarQueue *queue;

    // Kind of this command - copy, kernel, barrier, etc:
    hcCommandKind commandKind;

    // Sequence number of this op in the queue it is dispatched into.
    uint64_t seqNum;
};
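
KalmarAsyncOp is the base class for the handles returned by the asynchronous queue APIs below; concrete back-ends override the readiness and timestamp hooks, which the base class stubs out with zero/false returns. A hedged sketch of polling an op and deriving its duration, assuming a back-end that implements those hooks:

// Sketch (illustrative): busy-poll an async op and compute its duration.
// Assumes a back-end whose op overrides the timestamp/readiness virtuals.
#include "kalmar_runtime.h"

double elapsed_seconds(const std::shared_ptr<Kalmar::KalmarAsyncOp>& op) {
    while (!op->isReady()) {
        // A real caller could yield, or wait on op->getFuture() if provided.
    }
    uint64_t ticks = op->getEndTimestamp() - op->getBeginTimestamp();
    uint64_t freq  = op->getTimestampFrequency();
    return freq ? static_cast<double>(ticks) / freq : 0.0;
}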

class KalmarQueue
{
public:

    KalmarQueue(KalmarDevice* pDev, queuing_mode mode = queuing_mode_automatic, execute_order order = execute_in_order)
        : pDev(pDev), mode(mode), order(order), opSeqNums(0) {}

    virtual ~KalmarQueue() {}

    virtual void flush() {}
    virtual void wait(hcWaitMode mode = hcWaitModeBlocked) {}

    // sync kernel launch with dynamic group memory
    virtual void LaunchKernelWithDynamicGroupMemory(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size, size_t dynamic_group_size) {}

    // async kernel launch with dynamic group memory
    virtual std::shared_ptr<KalmarAsyncOp> LaunchKernelWithDynamicGroupMemoryAsync(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size, size_t dynamic_group_size) { return nullptr; }

    // sync kernel launch
    virtual void LaunchKernel(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size) {}

    // async kernel launch
    virtual std::shared_ptr<KalmarAsyncOp> LaunchKernelAsync(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size) { return LaunchKernelWithDynamicGroupMemoryAsync(kernel, dim_ext, ext, local_size, 0); }

    virtual void read(void* device, void* dst, size_t count, size_t offset) = 0;

    virtual void write(void* device, const void* src, size_t count, size_t offset, bool blocking) = 0;

    virtual void copy(void* src, void* dst, size_t count, size_t src_offset, size_t dst_offset, bool blocking) = 0;

    virtual void* map(void* device, size_t count, size_t offset, bool modify) = 0;

    virtual void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) = 0;

    virtual void Push(void *kernel, int idx, void* device, bool modify) = 0;

    virtual uint32_t GetGroupSegmentSize(void *kernel) { return 0; }

    KalmarDevice* getDev() const { return pDev; }
    queuing_mode get_mode() const { return mode; }
    void set_mode(queuing_mode mod) { mode = mod; }

    execute_order get_execute_order() const { return order; }

    virtual int getPendingAsyncOps() { return 0; }

    virtual bool isEmpty() { return false; }

    virtual void* getHSAQueue() { return nullptr; }

    virtual void* getHSAAgent() { return nullptr; }

    virtual void* getHSAAMRegion() { return nullptr; }

    virtual void* getHSAAMHostRegion() { return nullptr; }

    virtual void* getHSACoherentAMHostRegion() { return nullptr; }

    virtual void* getHSAKernargRegion() { return nullptr; }

    virtual bool hasHSAInterOp() { return false; }

    virtual std::shared_ptr<KalmarAsyncOp> EnqueueMarker(memory_scope) { return nullptr; }

    virtual std::shared_ptr<KalmarAsyncOp> EnqueueMarkerWithDependency(int count, std::shared_ptr<KalmarAsyncOp> *depOps, memory_scope scope) { return nullptr; }

    virtual std::shared_ptr<KalmarAsyncOp> detectStreamDeps(hcCommandKind commandKind, KalmarAsyncOp *newCopyOp) { return nullptr; }

    virtual std::shared_ptr<KalmarAsyncOp> EnqueueAsyncCopy(const void* src, void* dst, size_t size_bytes) { return nullptr; }
    virtual std::shared_ptr<KalmarAsyncOp> EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes,
                                                               hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
                                                               const Kalmar::KalmarDevice *copyDevice) { return nullptr; }

    // Copy src to dst synchronously
    virtual void copy(const void *src, void *dst, size_t size_bytes) {}

    virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo, bool forceUnpinnedCopy) {}
    virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
                          const Kalmar::KalmarDevice *copyDev, bool forceUnpinnedCopy) {}

    virtual void dispose() {}

    virtual void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
                                     const void *args, size_t argsize,
                                     hc::completion_future *cf, const char *kernel_name) {}

    virtual bool set_cu_mask(const std::vector<bool>& cu_mask) { return false; }

    uint64_t assign_op_seq_num() { return ++opSeqNums; }

private:
    KalmarDevice* pDev;
    queuing_mode mode;
    execute_order order;

    uint64_t opSeqNums; // last seqnum assigned to an op in this queue
};
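
KalmarQueue is the abstract per-device command queue; a concrete back-end (for example the HSA queue) implements the launch, copy, and marker virtuals, while the base class provides inert stubs. A sketch of issuing an asynchronous copy followed by a system-scope marker (illustrative; error handling omitted):

// Sketch (illustrative): async copy followed by a system-scope release marker.
// The base-class stubs return nullptr; a real back-end queue implements them.
#include "kalmar_runtime.h"

void async_copy_then_fence(const std::shared_ptr<Kalmar::KalmarQueue>& q,
                           const void* src, void* dst, size_t bytes) {
    auto copyOp = q->EnqueueAsyncCopy(src, dst, bytes);
    auto marker = q->EnqueueMarkerWithDependency(1, &copyOp,
                                                 Kalmar::enums::system_scope);
    if (marker)
        marker->setWaitMode(Kalmar::enums::hcWaitModeBlocked);
    q->wait(); // blocks until all pending ops on this queue have completed
}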

class KalmarDevice
{
private:
    access_type cpu_type;

    // Set true if the device has large bar

#if !TLS_QUEUE
    std::shared_ptr<KalmarQueue> def;
    std::once_flag flag;
#else
    std::map< std::thread::id, std::shared_ptr<KalmarQueue> > tlsDefaultQueueMap;
    std::mutex tlsDefaultQueueMap_mutex;
#endif

protected:
    // True if the device memory is mapped into CPU address space and can be
    // directly accessed with CPU memory operations.
    bool cpu_accessible_am;

    KalmarDevice(access_type type = access_type_read_write)
        : cpu_type(type),
#if !TLS_QUEUE
          def(), flag()
#else
          tlsDefaultQueueMap(), tlsDefaultQueueMap_mutex()
#endif
    {}

public:
    access_type get_access() const { return cpu_type; }
    void set_access(access_type type) { cpu_type = type; }

    virtual std::wstring get_path() const = 0;
    virtual std::wstring get_description() const = 0;
    virtual size_t get_mem() const = 0;
    virtual bool is_double() const = 0;
    virtual bool is_lim_double() const = 0;
    virtual bool is_unified() const = 0;
    virtual bool is_emulated() const = 0;
    virtual uint32_t get_version() const = 0;

    // key can be used to avoid duplicate allocation
    virtual void* create(size_t count, struct rw_info* key) = 0;

    virtual void release(void* ptr, struct rw_info* key) = 0;

    virtual void BuildProgram(void* size, void* source) {}

    virtual void* CreateKernel(const char* fun, KalmarQueue *queue) { return nullptr; }

    virtual bool IsCompatibleKernel(void* size, void* source) { return true; }

    virtual bool check(size_t* size, size_t dim_ext) { return true; }

    virtual std::shared_ptr<KalmarQueue> createQueue(execute_order order = execute_in_order) = 0;
    virtual ~KalmarDevice() {}

    std::shared_ptr<KalmarQueue> get_default_queue() {
#if !TLS_QUEUE
        std::call_once(flag, [&]() {
            def = createQueue();
        });
        return def;
#else
        std::thread::id tid = std::this_thread::get_id();
        tlsDefaultQueueMap_mutex.lock();
        if (tlsDefaultQueueMap.find(tid) == tlsDefaultQueueMap.end()) {
            tlsDefaultQueueMap[tid] = createQueue();
        }
        std::shared_ptr<KalmarQueue> result = tlsDefaultQueueMap[tid];
        tlsDefaultQueueMap_mutex.unlock();
        return result;
#endif
    }

    virtual size_t GetMaxTileStaticSize() { return 0; }

    virtual std::vector< std::shared_ptr<KalmarQueue> > get_all_queues() { return std::vector< std::shared_ptr<KalmarQueue> >(); }

    virtual void memcpySymbol(const char* symbolName, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {}

    virtual void memcpySymbol(void* symbolAddr, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {}

    virtual void* getSymbolAddress(const char* symbolName) { return nullptr; }

    virtual void* getHSAAgent() { return nullptr; }

    virtual hcAgentProfile getProfile() { return hcAgentProfileNone; }

    virtual bool is_peer(const KalmarDevice* other) { return false; }

    virtual unsigned int get_compute_unit_count() { return 0; }

    virtual int get_seqnum() const { return -1; }

    virtual bool has_cpu_accessible_am() { return false; }
};
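
KalmarDevice describes one accelerator (or the CPU fallback) and lazily creates its default queue: one shared queue guarded by std::call_once, or one queue per thread when TLS_QUEUE is set. A short illustrative sketch of querying a device and fetching that queue:

// Sketch (illustrative): inspect a device and obtain its default queue.
// 'dev' is any concrete KalmarDevice handed out by the runtime.
#include "kalmar_runtime.h"

std::shared_ptr<Kalmar::KalmarQueue> default_queue_for(Kalmar::KalmarDevice* dev) {
    // Devices with a unified address space can reasonably be given full CPU access.
    if (dev->is_unified())
        dev->set_access(Kalmar::enums::access_type_read_write);
    return dev->get_default_queue(); // cached: same queue on every call (per thread with TLS_QUEUE)
}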

class CPUQueue final : public KalmarQueue
{
public:

    CPUQueue(KalmarDevice* pDev) : KalmarQueue(pDev) {}

    void read(void* device, void* dst, size_t count, size_t offset) override {
        if (dst != device)
            memmove(dst, (char*)device + offset, count);
    }

    void write(void* device, const void* src, size_t count, size_t offset, bool blocking) override {
        if (src != device)
            memmove((char*)device + offset, src, count);
    }

    void copy(void* src, void* dst, size_t count, size_t src_offset, size_t dst_offset, bool blocking) override {
        if (src != dst)
            memmove((char*)dst + dst_offset, (char*)src + src_offset, count);
    }

    void* map(void* device, size_t count, size_t offset, bool modify) override {
        return (char*)device + offset;
    }

    void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) override {}

    void Push(void *kernel, int idx, void* device, bool modify) override {}
};
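
CPUQueue is the host-side implementation of the queue interface: read, write, and copy reduce to memmove at the given offsets, map just returns a pointer into the buffer, and the kernel-related hooks are no-ops. A sketch of driving it directly (illustrative; the runtime normally creates it through CPUDevice::createQueue):

// Sketch (illustrative): use CPUQueue directly against a host buffer.
#include "kalmar_runtime.h"

void cpu_queue_roundtrip(Kalmar::KalmarDevice* cpuDev) {
    Kalmar::CPUQueue q(cpuDev);
    char buffer[16] = {0};
    const char src[4] = {'a', 'b', 'c', 'd'};
    char dst[4] = {0};
    q.write(buffer, src, sizeof(src), /*offset=*/4, /*blocking=*/true); // memmove into buffer + 4
    q.read(buffer, dst, sizeof(dst), /*offset=*/4);                     // memmove back out of buffer + 4
}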

class CPUDevice final : public KalmarDevice
{
public:
    std::wstring get_path() const override { return L"cpu"; }
    std::wstring get_description() const override { return L"CPU Device"; }
    size_t get_mem() const override { return 0; }
    bool is_double() const override { return true; }
    bool is_lim_double() const override { return true; }
    bool is_unified() const override { return true; }
    bool is_emulated() const override { return true; }
    uint32_t get_version() const override { return 0; }

    std::shared_ptr<KalmarQueue> createQueue(execute_order order = execute_in_order) override { return std::shared_ptr<KalmarQueue>(new CPUQueue(this)); }
    void* create(size_t count, struct rw_info* /* not used */) override { return kalmar_aligned_alloc(0x1000, count); }
    void release(void* ptr, struct rw_info* /* not used */) override { kalmar_aligned_free(ptr); }
    void* CreateKernel(const char* fun, KalmarQueue *queue) override { return nullptr; }
};

class KalmarContext
{
private:
    // TODO: Think about a system that has multiple CPU sockets (e.g. a server). In that case
    // we might be able to assume that only the first device is the CPU, or we only mimic one
    // CPU device when constructing KalmarContext.
    KalmarDevice* get_default_dev() {
        if (!def) {
            // Devices[0] is the CPU device created in the constructor, so at least
            // one accelerator (Devices[1]) is required here.
            if (Devices.size() <= 1) {
                fprintf(stderr, "There is no device that can be used to do the computation\n");
                exit(-1);
            }
            def = Devices[1];
        }
        return def;
    }

protected:
    KalmarDevice* def;
    std::vector<KalmarDevice*> Devices;
    KalmarContext() : def(nullptr), Devices() { Devices.push_back(new CPUDevice); }

    bool init_success = false;

public:
    virtual ~KalmarContext() {}

    std::vector<KalmarDevice*> getDevices() { return Devices; }

    bool set_default(const std::wstring& path) {
        auto result = std::find_if(std::begin(Devices), std::end(Devices),
                                   [&] (const KalmarDevice* pDev)
                                   { return pDev->get_path() == path; });
        if (result == std::end(Devices))
            return false;
        else {
            def = *result;
            return true;
        }
    }

    std::shared_ptr<KalmarQueue> auto_select() {
        return get_default_dev()->get_default_queue();
    }

    KalmarDevice* getDevice(std::wstring path = L"") {
        if (path == L"default" || path == L"")
            return get_default_dev();
        auto result = std::find_if(std::begin(Devices), std::end(Devices),
                                   [&] (const KalmarDevice* dev)
                                   { return dev->get_path() == path; });
        if (result != std::end(Devices))
            return *result;
        else
            return get_default_dev();
    }

    virtual uint64_t getSystemTicks() { return 0L; }

    virtual uint64_t getSystemTickFrequency() { return 0L; }

    // initialize the printf buffer
    virtual void initPrintfBuffer() {}

    // flush the device printf buffer
    virtual void flushPrintfBuffer() {}

    // get the locked printf buffer VA
    virtual void* getPrintfBufferPointerVA() { return nullptr; }
};

KalmarContext *getContext();

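KalmarContext owns the device list (index 0 is always the CPUDevice pushed in its constructor) and tracks the default accelerator; getContext() returns the process-wide instance. A sketch of typical device selection through the context (the device path string below is hypothetical):

// Sketch (illustrative): select a device through the global context.
// "gfx900" is a hypothetical device path; getDevice() falls back to the
// default device when the path is not found.
#include "kalmar_runtime.h"

std::shared_ptr<Kalmar::KalmarQueue> pick_queue() {
    Kalmar::KalmarContext* ctx = Kalmar::getContext();
    Kalmar::KalmarDevice* dev = ctx->getDevice(L"gfx900");
    return dev->get_default_queue(); // ctx->auto_select() is the shorthand for the default device
}
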
namespace CLAMP {
// used in parallel_for_each.h
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
extern bool is_cpu();
extern bool in_cpu_kernel();
extern void enter_kernel();
extern void leave_kernel();
#endif

extern void *CreateKernel(std::string, KalmarQueue*);

extern void PushArg(void *, int, size_t, const void *);
extern void PushArgPtr(void *, int, size_t, const void *);

} // namespace CLAMP

static inline const std::shared_ptr<KalmarQueue> get_cpu_queue() {
    static auto cpu_queue = getContext()->getDevice(L"cpu")->get_default_queue();
    return cpu_queue;
}

static inline bool is_cpu_queue(const std::shared_ptr<KalmarQueue>& Queue) {
    return Queue->getDev()->get_path() == L"cpu";
}

static inline void copy_helper(std::shared_ptr<KalmarQueue>& srcQueue, void* src,
                               std::shared_ptr<KalmarQueue>& dstQueue, void* dst,
                               size_t cnt, bool block,
                               size_t src_offset = 0, size_t dst_offset = 0) {
    if (src == dst)
        return;

    if (is_cpu_queue(dstQueue))
        srcQueue->read(src, (char*)dst + dst_offset, cnt, src_offset);
    else
        dstQueue->write(dst, (char*)src + src_offset, cnt, dst_offset, block);
}
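
copy_helper() moves cnt bytes between buffers owned by two queues: if the destination queue is the CPU it issues a read on the source queue, otherwise it issues a write on the destination queue. A hedged sketch of a host-to-device upload built on it (the buffer pointers are assumed to come from the devices' create() calls):

// Sketch (illustrative): stage a host buffer onto an accelerator via copy_helper.
// hostPtr/devPtr are assumed to be allocations owned by the CPU and device sides.
#include "kalmar_runtime.h"

void upload(std::shared_ptr<Kalmar::KalmarQueue> devQueue,
            void* devPtr, void* hostPtr, size_t bytes) {
    auto cpuQueue = Kalmar::get_cpu_queue();
    // Destination is not a CPU queue, so this lowers to devQueue->write(...).
    Kalmar::copy_helper(cpuQueue, hostPtr, devQueue, devPtr, bytes, /*block=*/true);
}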

enum states
{
    // this copy alone holds the latest data
    modified,
    // up to date; other copies may exist
    shared,
    // not able to read and write
    invalid
};

struct dev_info
{
    void* data;
    states state;
};
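
Each per-device copy of a buffer carries a dev_info record whose state follows a small MSI-like protocol: modified means this copy alone is current, shared means it is current but other copies may be too, and invalid means it must be refreshed before use. Two illustrative helpers that restate the rule rw_info::sync() (below) enforces:

// Sketch (illustrative): the coherence rules that rw_info::sync() enforces.
#include "kalmar_runtime.h"

bool needs_refresh(const Kalmar::dev_info& d) {
    return d.state == Kalmar::invalid;  // stale copy: data must be copied in before use
}

bool invalidates_other_copies(Kalmar::states s) {
    return s == Kalmar::modified;       // writing one copy invalidates all the others
}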

struct rw_info
{
    void *data;
    const size_t count;
    std::shared_ptr<KalmarQueue> curr;
    std::shared_ptr<KalmarQueue> master;
    std::shared_ptr<KalmarQueue> stage;
    std::map<KalmarDevice*, dev_info> devs;
    access_type mode;
    unsigned int HostPtr : 1;

    bool toReleaseDevPointer;

    rw_info(const size_t count, void* ptr)
        : data(ptr), count(count), curr(nullptr), master(nullptr), stage(nullptr),
          devs(), mode(access_type_none), HostPtr(ptr != nullptr), toReleaseDevPointer(true) {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel() && ptr == nullptr) {
            data = kalmar_aligned_alloc(0x1000, count);
            return;
        }
#endif
        if (ptr) {
            mode = access_type_read_write;
            curr = master = get_cpu_queue();
            devs[curr->getDev()] = {ptr, modified};
        }
    }

    rw_info(const std::shared_ptr<KalmarQueue>& Queue, const std::shared_ptr<KalmarQueue>& Stage,
            const size_t count, access_type mode_) : data(nullptr), count(count),
        curr(Queue), master(Queue), stage(nullptr), devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(true) {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel() && data == nullptr) {
            data = kalmar_aligned_alloc(0x1000, count);
            return;
        }
#endif
        if (mode == access_type_auto)
            mode = curr->getDev()->get_access();
        devs[curr->getDev()] = {curr->getDev()->create(count, this), modified};

        if (is_cpu_queue(curr) || (curr->getDev()->is_unified() && mode != access_type_none))
            data = devs[curr->getDev()].data;
        if (is_cpu_queue(curr)) {
            stage = Stage;
            if (Stage != curr)
                devs[stage->getDev()] = {stage->getDev()->create(count, this), invalid};
        } else
            stage = curr;
    }

    rw_info(const std::shared_ptr<KalmarQueue>& Queue, const std::shared_ptr<KalmarQueue>& Stage,
            const size_t count,
            void* device_pointer,
            access_type mode_) : data(nullptr), count(count), curr(Queue), master(Queue), stage(nullptr), devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(false) {
        if (mode == access_type_auto)
            mode = curr->getDev()->get_access();
        devs[curr->getDev()] = { device_pointer, modified };

        if (is_cpu_queue(curr) || (curr->getDev()->is_unified() && mode != access_type_none))
            data = devs[curr->getDev()].data;
        if (is_cpu_queue(curr)) {
            stage = Stage;
            if (Stage != curr)
                devs[stage->getDev()] = {stage->getDev()->create(count, this), invalid};
        } else
            stage = curr;
    }

    void* get_device_pointer() {
        return devs[curr->getDev()].data;
    }

    void construct(std::shared_ptr<KalmarQueue> pQueue) {
        curr = pQueue;
        devs[pQueue->getDev()] = {pQueue->getDev()->create(count, this), invalid};
        if (is_cpu_queue(pQueue))
            data = devs[pQueue->getDev()].data;
    }

    void disc() {
        for (auto& it : devs)
            it.second.state = invalid;
    }

    void try_switch_to_cpu() {
        if (is_cpu_queue(curr))
            return;
        auto cpu_queue = get_cpu_queue();
        if (devs.find(cpu_queue->getDev()) != std::end(devs))
            if (devs[cpu_queue->getDev()].state == shared)
                curr = cpu_queue;
    }

    void sync(std::shared_ptr<KalmarQueue> pQueue, bool modify, bool block = true) {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel())
            return;
#endif
        if (!curr) {
            dev_info dev = {pQueue->getDev()->create(count, this),
                            modify ? modified : shared};
            devs[pQueue->getDev()] = dev;
            if (is_cpu_queue(pQueue))
                data = dev.data;
            curr = pQueue;
            return;
        }

        if (curr == pQueue)
            return;

        if (curr->getDev() == pQueue->getDev()) {
            // curr->wait();
            curr = pQueue;
            if (modify) {
                disc();
                devs[curr->getDev()].state = modified;
            }
            return;
        }

        if (devs.find(pQueue->getDev()) == std::end(devs)) {
            dev_info dev = {pQueue->getDev()->create(count, this), invalid};
            devs[pQueue->getDev()] = dev;
            if (is_cpu_queue(pQueue))
                data = dev.data;
        }

        try_switch_to_cpu();
        dev_info& dst = devs[pQueue->getDev()];
        dev_info& src = devs[curr->getDev()];
        if (dst.state == invalid && src.state != invalid)
            copy_helper(curr, src.data, pQueue, dst.data, count, block);
        curr = pQueue;
        if (modify) {
            disc();
            dst.state = modified;
        } else {
            dst.state = shared;
            if (src.state == modified)
                src.state = shared;
        }
    }

    void* map(size_t cnt, size_t offset, bool modify) {
        if (cnt == 0)
            cnt = count;
        if (!curr) {
            curr = getContext()->auto_select();
            devs[curr->getDev()] = {curr->getDev()->create(count, this), modify ? modified : shared};
            return curr->map(data, cnt, offset, modify);
        }
        try_switch_to_cpu();
        dev_info& info = devs[curr->getDev()];
        if (info.state == shared && modify) {
            disc();
            info.state = modified;
        }
        return curr->map(info.data, cnt, offset, modify);
    }

    void unmap(void* addr, size_t cnt, size_t offset, bool modify) { curr->unmap(devs[curr->getDev()].data, addr, cnt, offset, modify); }

    void synchronize(bool modify) { sync(master, modify); }

    void get_cpu_access(bool modify) { sync(get_cpu_queue(), modify); }

    void write(const void* src, int cnt, int offset, bool blocking) {
        curr->write(devs[curr->getDev()].data, src, cnt, offset, blocking);
        dev_info& dev = devs[curr->getDev()];
        if (dev.state != modified) {
            disc();
            dev.state = modified;
        }
    }

    void read(void* dst, int cnt, int offset) {
        curr->read(devs[curr->getDev()].data, dst, cnt, offset);
    }

    void copy(rw_info* other, int src_offset, int dst_offset, int cnt) {
        if (cnt == 0)
            cnt = count;
        if (!curr) {
            if (!other->curr)
                return;
            else
                construct(other->curr);
        } else {
            if (!other->curr)
                other->construct(curr);
        }
        dev_info& dst = other->devs[other->curr->getDev()];
        dev_info& src = devs[curr->getDev()];
        if (src.state == invalid) {
            src.state = shared;
            if (is_cpu_queue(curr))
                memset((char*)src.data + src_offset, 0, cnt);
            else {
                void *ptr = kalmar_aligned_alloc(0x1000, cnt);
                memset(ptr, 0, cnt);
                curr->write(src.data, ptr, cnt, src_offset, true);
                kalmar_aligned_free(ptr);
            }
        }
        copy_helper(curr, src.data, other->curr, dst.data, cnt, true, src_offset, dst_offset);
        other->disc();
        dst.state = modified;
    }

    ~rw_info() {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel()) {
            if (data && !HostPtr)
                kalmar_aligned_free(data);
            return;
        }
#endif

        if (HostPtr)
            synchronize(false);
        if (curr) {
            // Wait issues a system-scope release:
            // we need to make sure cache contents are written back before deallocating
            // the memory those writes might eventually touch.
            curr->wait();
        }
        auto cpu_dev = get_cpu_queue()->getDev();
        if (devs.find(cpu_dev) != std::end(devs)) {
            if (!HostPtr)
                cpu_dev->release(devs[cpu_dev].data, this);
            devs.erase(cpu_dev);
        }
        KalmarDevice* pDev;
        dev_info info;
        for (const auto& it : devs) {
            std::tie(pDev, info) = it;
            if (toReleaseDevPointer)
                pDev->release(info.data, this);
        }
    }
};
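
rw_info ties the pieces together: it tracks one logical buffer across devices, allocates per-device copies on demand, and moves data with copy_helper() according to the state machine in sync(). A hedged sketch of the typical lifecycle, assuming 'gpuQueue' is any non-CPU queue supplied by the runtime:

// Sketch (illustrative): migrate a host-backed buffer to an accelerator and back.
// 'gpuQueue' is any non-CPU KalmarQueue supplied by the runtime.
#include "kalmar_runtime.h"
#include <vector>

void roundtrip(std::shared_ptr<Kalmar::KalmarQueue> gpuQueue) {
    std::vector<char> host(4096, 0);
    Kalmar::rw_info buf(host.size(), host.data()); // starts on the CPU queue, state = modified

    buf.sync(gpuQueue, /*modify=*/true);   // allocate a device copy, copy host -> device,
                                           // mark it modified and every other copy invalid
    buf.get_cpu_access(/*modify=*/false);  // copy device -> host; both copies become shared
    // ~rw_info releases the device-side allocation; the host allocation is not freed.
}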


//--- Implementation:
//

inline void KalmarAsyncOp::setSeqNumFromQueue() { seqNum = queue->assign_op_seq_num(); }

} // namespace Kalmar