#include "hc_defines.h"
#include "kalmar_aligned_alloc.h"

class completion_future;
typedef struct hsa_kernel_dispatch_packet_s hsa_kernel_dispatch_packet_t;
    access_type_read = (1 << 0),
    access_type_write = (1 << 1),
    access_type_read_write = access_type_read | access_type_write,
    access_type_auto = (1 << 31)
    queuing_mode_immediate,
    queuing_mode_automatic
static inline memory_scope greater_scope(memory_scope scope1, memory_scope scope2)
{
    if ((scope1 == system_scope) || (scope2 == system_scope)) {
        return system_scope;
    } else if ((scope1 == accelerator_scope) || (scope2 == accelerator_scope)) {
        return accelerator_scope;
    }
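// Illustrative usage (not part of the original header): greater_scope() picks the
// wider of two memory scopes, so a single fence issued at the combined scope
// satisfies both requests, e.g.
//   memory_scope s = greater_scope(accelerator_scope, system_scope); // s == system_scope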
    hcMemcpyHostToHost = 0,
    hcMemcpyHostToDevice = 1,
    hcMemcpyDeviceToHost = 2,
    hcMemcpyDeviceToDevice = 3,
static inline bool isCopyCommand(hcCommandKind k)
{
    switch (k) {
    case hcMemcpyHostToHost:
    case hcMemcpyHostToDevice:
    case hcMemcpyDeviceToHost:
    case hcMemcpyDeviceToDevice:
        return true;
static inline bool isComputeQueueCommand(hcCommandKind k) {
    return (k == hcCommandKernel) || (k == hcCommandMarker);
}
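// Illustrative usage (not part of the original header): the two predicates above
// classify commands by how they are serviced, e.g.
//   isCopyCommand(hcMemcpyDeviceToHost)      // true - a data-transfer command
//   isComputeQueueCommand(hcCommandKernel)   // true - kernels and markers go to the compute queue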
    hcWaitModeBlocked = 0,
enum hcAgentProfile {
    hcAgentProfileNone = 0,
    hcAgentProfileBase = 1,
    hcAgentProfileFull = 2
};
class KalmarAsyncOp {
    KalmarAsyncOp(KalmarQueue *xqueue, hcCommandKind xCommandKind) : queue(xqueue), commandKind(xCommandKind), seqNum(0) {}

    virtual ~KalmarAsyncOp() {}
    virtual std::shared_future<void>* getFuture() { return nullptr; }
    virtual void* getNativeHandle() { return nullptr; }
    virtual uint64_t getBeginTimestamp() { return 0L; }
    virtual uint64_t getEndTimestamp() { return 0L; }
    virtual uint64_t getTimestampFrequency() { return 0L; }
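    // Illustrative sketch (not part of the original header): a backend that reports
    // real begin/end ticks and their frequency lets callers derive a duration, e.g.
    //   double seconds = double(op->getEndTimestamp() - op->getBeginTimestamp())
    //                    / double(op->getTimestampFrequency());  // 'op' is a hypothetical KalmarAsyncOp*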
    virtual bool isReady() { return false; }
    virtual void setWaitMode(hcWaitMode mode) {}

    void setSeqNumFromQueue();
    uint64_t getSeqNum() const { return seqNum; }
    hcCommandKind getCommandKind() const { return commandKind; }
    void setCommandKind(hcCommandKind xCommandKind) { commandKind = xCommandKind; }
    KalmarQueue *getQueue() const { return queue; }

    hcCommandKind commandKind;
    KalmarQueue(KalmarDevice* pDev, queuing_mode mode = queuing_mode_automatic, execute_order order = execute_in_order)
        : pDev(pDev), mode(mode), order(order), opSeqNums(0) {}

    virtual ~KalmarQueue() {}

    virtual void flush() {}
    virtual void wait(hcWaitMode mode = hcWaitModeBlocked) {}
    virtual void LaunchKernelWithDynamicGroupMemory(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size, size_t dynamic_group_size) {}

    virtual std::shared_ptr<KalmarAsyncOp> LaunchKernelWithDynamicGroupMemoryAsync(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size, size_t dynamic_group_size) { return nullptr; }

    virtual void LaunchKernel(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size) {}

    virtual std::shared_ptr<KalmarAsyncOp> LaunchKernelAsync(void *kernel, size_t dim_ext, size_t *ext, size_t *local_size) {
        return LaunchKernelWithDynamicGroupMemoryAsync(kernel, dim_ext, ext, local_size, 0);
    }
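    // Note (added for clarity, not in the original header): the default LaunchKernelAsync
    // forwards to LaunchKernelWithDynamicGroupMemoryAsync with dynamic_group_size = 0,
    // so a backend only needs to override the dynamic-group-memory variant to serve both
    // entry points.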
    virtual void read(void* device, void* dst, size_t count, size_t offset) = 0;
    virtual void write(void* device, const void* src, size_t count, size_t offset, bool blocking) = 0;
    virtual void copy(void* src, void* dst, size_t count, size_t src_offset, size_t dst_offset, bool blocking) = 0;
    virtual void* map(void* device, size_t count, size_t offset, bool modify) = 0;
    virtual void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) = 0;
    virtual void Push(void *kernel, int idx, void* device, bool modify) = 0;

    virtual uint32_t GetGroupSegmentSize(void *kernel) { return 0; }
    KalmarDevice* getDev() const { return pDev; }
    queuing_mode get_mode() const { return mode; }
    void set_mode(queuing_mode mod) { mode = mod; }
    execute_order get_execute_order() const { return order; }

    virtual int getPendingAsyncOps() { return 0; }
    virtual bool isEmpty() { return false; }

    virtual void* getHSAQueue() { return nullptr; }
    virtual void* getHSAAgent() { return nullptr; }
    virtual void* getHSAAMRegion() { return nullptr; }
    virtual void* getHSAAMHostRegion() { return nullptr; }
    virtual void* getHSACoherentAMHostRegion() { return nullptr; }
    virtual void* getHSAKernargRegion() { return nullptr; }
    virtual bool hasHSAInterOp() { return false; }

    virtual std::shared_ptr<KalmarAsyncOp> EnqueueMarker(memory_scope) { return nullptr; }
    virtual std::shared_ptr<KalmarAsyncOp> EnqueueMarkerWithDependency(int count, std::shared_ptr<KalmarAsyncOp> *depOps, memory_scope scope) { return nullptr; }

    virtual std::shared_ptr<KalmarAsyncOp> detectStreamDeps(hcCommandKind commandKind, KalmarAsyncOp *newCopyOp) { return nullptr; }
    virtual std::shared_ptr<KalmarAsyncOp> EnqueueAsyncCopy(const void* src, void* dst, size_t size_bytes) { return nullptr; }

    virtual std::shared_ptr<KalmarAsyncOp> EnqueueAsyncCopyExt(const void* src, void* dst, size_t size_bytes,
                                                               hcCommandKind copyDir, const hc::AmPointerInfo &srcInfo,
                                                               const hc::AmPointerInfo &dstInfo,
                                                               const Kalmar::KalmarDevice *copyDevice) { return nullptr; }
    virtual void copy(const void *src, void *dst, size_t size_bytes) {}

    virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir,
                          const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
                          bool forceUnpinnedCopy) {}

    virtual void copy_ext(const void *src, void *dst, size_t size_bytes, hcCommandKind copyDir,
                          const hc::AmPointerInfo &srcInfo, const hc::AmPointerInfo &dstInfo,
                          const Kalmar::KalmarDevice *copyDev, bool forceUnpinnedCopy) {}
    virtual void dispose() {}

    virtual void dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
                                     const void * args, size_t argsize,
    virtual bool set_cu_mask(const std::vector<bool>& cu_mask) { return false; }

    uint64_t assign_op_seq_num() { return ++opSeqNums; }
    access_type cpu_type;
    std::shared_ptr<KalmarQueue> def;
    std::map<std::thread::id, std::shared_ptr<KalmarQueue>> tlsDefaultQueueMap;
    std::mutex tlsDefaultQueueMap_mutex;
    bool cpu_accessible_am;

    KalmarDevice(access_type type = access_type_read_write)
        : /* ... */ tlsDefaultQueueMap(), tlsDefaultQueueMap_mutex()
    access_type get_access() const { return cpu_type; }
    void set_access(access_type type) { cpu_type = type; }

    virtual std::wstring get_path() const = 0;
    virtual std::wstring get_description() const = 0;
    virtual size_t get_mem() const = 0;
    virtual bool is_double() const = 0;
    virtual bool is_lim_double() const = 0;
    virtual bool is_unified() const = 0;
    virtual bool is_emulated() const = 0;
    virtual uint32_t get_version() const = 0;
    virtual void* create(size_t count, struct rw_info* key) = 0;
    virtual void release(void* ptr, struct rw_info* key) = 0;

    virtual void BuildProgram(void* size, void* source) {}

    virtual void* CreateKernel(const char* fun, KalmarQueue *queue) { return nullptr; }

    virtual bool IsCompatibleKernel(void* size, void* source) { return true; }

    virtual bool check(size_t* size, size_t dim_ext) { return true; }

    virtual std::shared_ptr<KalmarQueue> createQueue(execute_order order = execute_in_order) = 0;

    virtual ~KalmarDevice() {}
    std::shared_ptr<KalmarQueue> get_default_queue() {
        std::call_once(flag, [&]() {
            /* ... */
        });
        std::thread::id tid = std::this_thread::get_id();
        tlsDefaultQueueMap_mutex.lock();
        if (tlsDefaultQueueMap.find(tid) == tlsDefaultQueueMap.end()) {
            tlsDefaultQueueMap[tid] = createQueue();
        }
        std::shared_ptr<KalmarQueue> result = tlsDefaultQueueMap[tid];
        tlsDefaultQueueMap_mutex.unlock();
        return result;
    }
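    // Illustrative usage (not part of the original header): get_default_queue() keeps one
    // lazily created queue per host thread in tlsDefaultQueueMap, guarded by
    // tlsDefaultQueueMap_mutex, so concurrent host threads do not share a default queue:
    //   auto q = dev->get_default_queue();   // 'dev' is a hypothetical KalmarDevice*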
    virtual size_t GetMaxTileStaticSize() { return 0; }

    virtual std::vector<std::shared_ptr<KalmarQueue>> get_all_queues() { return std::vector<std::shared_ptr<KalmarQueue>>(); }
    virtual void memcpySymbol(const char* symbolName, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {}

    virtual void memcpySymbol(void* symbolAddr, void* hostptr, size_t count, size_t offset = 0, hcCommandKind kind = hcMemcpyHostToDevice) {}

    virtual void* getSymbolAddress(const char* symbolName) { return nullptr; }
    virtual void* getHSAAgent() { return nullptr; }

    virtual hcAgentProfile getProfile() { return hcAgentProfileNone; }

    virtual bool is_peer(const KalmarDevice* other) { return false; }

    virtual unsigned int get_compute_unit_count() { return 0; }

    virtual int get_seqnum() const { return -1; }

    virtual bool has_cpu_accessible_am() { return false; }
class CPUQueue final : public KalmarQueue
{
public:
    CPUQueue(KalmarDevice* pDev) : KalmarQueue(pDev) {}
    void read(void* device, void* dst, size_t count, size_t offset) override {
        memmove(dst, (char*)device + offset, count);
    }

    void write(void* device, const void* src, size_t count, size_t offset, bool blocking) override {
        memmove((char*)device + offset, src, count);
    }

    void copy(void* src, void* dst, size_t count, size_t src_offset, size_t dst_offset, bool blocking) override {
        memmove((char*)dst + dst_offset, (char*)src + src_offset, count);
    }

    void* map(void* device, size_t count, size_t offset, bool modify) override {
        return (char*)device + offset;
    }

    void unmap(void* device, void* addr, size_t count, size_t offset, bool modify) override {}

    void Push(void *kernel, int idx, void* device, bool modify) override {}
};
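// Note (added for clarity, not in the original header): for the CPU device the
// "device" pointer is plain host memory, so read/write/copy reduce to memmove,
// map() can hand back a pointer into the buffer directly, and unmap()/Push()
// have nothing to do.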
class CPUDevice final : public KalmarDevice
{
public:
    std::wstring get_path() const override { return L"cpu"; }
    std::wstring get_description() const override { return L"CPU Device"; }
    size_t get_mem() const override { return 0; }
    bool is_double() const override { return true; }
    bool is_lim_double() const override { return true; }
    bool is_unified() const override { return true; }
    bool is_emulated() const override { return true; }
    uint32_t get_version() const override { return 0; }

    std::shared_ptr<KalmarQueue> createQueue(execute_order order = execute_in_order) override { return std::shared_ptr<KalmarQueue>(new CPUQueue(this)); }
    void* create(size_t count, struct rw_info*) override { return kalmar_aligned_alloc(0x1000, count); }
    void release(void* ptr, struct rw_info*) override { kalmar_aligned_free(ptr); }
    void* CreateKernel(const char* fun, KalmarQueue *queue) { return nullptr; }
    KalmarDevice* get_default_dev() {
        if (Devices.size() <= 1) {
            fprintf(stderr, "There is no device that can be used to do the computation\n");
    std::vector<KalmarDevice*> Devices;

    KalmarContext() : def(nullptr), Devices() { Devices.push_back(new CPUDevice); }

    bool init_success = false;

    virtual ~KalmarContext() {}

    std::vector<KalmarDevice*> getDevices() { return Devices; }
    bool set_default(const std::wstring& path) {
        auto result = std::find_if(std::begin(Devices), std::end(Devices),
                                   [&] (const KalmarDevice* pDev)
                                   { return pDev->get_path() == path; });
        if (result == std::end(Devices))
    std::shared_ptr<KalmarQueue> auto_select() {
        return get_default_dev()->get_default_queue();
    }
    KalmarDevice* getDevice(std::wstring path = L"") {
        if (path == L"default" || path == L"")
            return get_default_dev();
        auto result = std::find_if(std::begin(Devices), std::end(Devices),
                                   [&] (const KalmarDevice* dev)
                                   { return dev->get_path() == path; });
        if (result != std::end(Devices))
            return *result;
        return get_default_dev();
    }
    virtual uint64_t getSystemTicks() { return 0L; }

    virtual uint64_t getSystemTickFrequency() { return 0L; }

    virtual void initPrintfBuffer() {}

    virtual void flushPrintfBuffer() {}

    virtual void* getPrintfBufferPointerVA() { return nullptr; }
KalmarContext *getContext();
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
extern bool is_cpu();
extern bool in_cpu_kernel();
extern void enter_kernel();
extern void leave_kernel();
#endif

extern void *CreateKernel(std::string, KalmarQueue*);
extern void PushArg(void *, int, size_t, const void *);
extern void PushArgPtr(void *, int, size_t, const void *);
static inline const std::shared_ptr<KalmarQueue> get_cpu_queue() {
    static auto cpu_queue = getContext()->getDevice(L"cpu")->get_default_queue();
    return cpu_queue;
}
static inline bool is_cpu_queue(const std::shared_ptr<KalmarQueue>& Queue) {
    return Queue->getDev()->get_path() == L"cpu";
}
static inline void copy_helper(std::shared_ptr<KalmarQueue>& srcQueue, void* src,
                               std::shared_ptr<KalmarQueue>& dstQueue, void* dst,
                               size_t cnt, bool block,
                               size_t src_offset = 0, size_t dst_offset = 0) {
    if (is_cpu_queue(dstQueue))
        srcQueue->read(src, (char*)dst + dst_offset, cnt, src_offset);
    else
        dstQueue->write(dst, (char*)src + src_offset, cnt, dst_offset, block);
}
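// Illustrative note (not part of the original header): copy_helper() picks the side
// that can touch host memory. When the destination queue is the CPU, the source
// queue read()s straight into the destination buffer; otherwise the destination
// queue write()s from the source buffer, honoring the 'block' flag.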
    std::shared_ptr<KalmarQueue> curr;
    std::shared_ptr<KalmarQueue> master;
    std::shared_ptr<KalmarQueue> stage;
    std::map<KalmarDevice*, dev_info> devs;
    unsigned int HostPtr : 1;
    bool toReleaseDevPointer;
    rw_info(const size_t count, void* ptr)
        : data(ptr), count(count), curr(nullptr), master(nullptr), stage(nullptr),
          devs(), mode(access_type_none), HostPtr(ptr != nullptr), toReleaseDevPointer(true) {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel() && ptr == nullptr) {
            data = kalmar_aligned_alloc(0x1000, count);

        mode = access_type_read_write;
        curr = master = get_cpu_queue();
        devs[curr->getDev()] = {ptr, modified};
    rw_info(const std::shared_ptr<KalmarQueue>& Queue, const std::shared_ptr<KalmarQueue>& Stage,
            const size_t count, access_type mode_) : data(nullptr), count(count),
          curr(Queue), master(Queue), stage(nullptr), devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(true) {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel() && data == nullptr) {
            data = kalmar_aligned_alloc(0x1000, count);

        if (mode == access_type_auto)
            mode = curr->getDev()->get_access();
        devs[curr->getDev()] = {curr->getDev()->create(count, this), modified};

        if (is_cpu_queue(curr) || (curr->getDev()->is_unified() && mode != access_type_none))
            data = devs[curr->getDev()].data;
        if (is_cpu_queue(curr)) {

            devs[stage->getDev()] = {stage->getDev()->create(count, this), invalid};
    rw_info(const std::shared_ptr<KalmarQueue>& Queue, const std::shared_ptr<KalmarQueue>& Stage,
            const size_t count,
            void* device_pointer,
            access_type mode_) : data(nullptr), count(count), curr(Queue), master(Queue), stage(nullptr),
          devs(), mode(mode_), HostPtr(false), toReleaseDevPointer(false) {
        if (mode == access_type_auto)
            mode = curr->getDev()->get_access();
        devs[curr->getDev()] = { device_pointer, modified };

        if (is_cpu_queue(curr) || (curr->getDev()->is_unified() && mode != access_type_none))
            data = devs[curr->getDev()].data;
        if (is_cpu_queue(curr)) {

            devs[stage->getDev()] = {stage->getDev()->create(count, this), invalid};
    void* get_device_pointer() {
        return devs[curr->getDev()].data;
    }

    void construct(std::shared_ptr<KalmarQueue> pQueue) {
        devs[pQueue->getDev()] = {pQueue->getDev()->create(count, this), invalid};
        if (is_cpu_queue(pQueue))
            data = devs[pQueue->getDev()].data;

        for (auto& it : devs)
            it.second.state = invalid;
    void try_switch_to_cpu() {
        if (is_cpu_queue(curr))
            return;
        auto cpu_queue = get_cpu_queue();
        if (devs.find(cpu_queue->getDev()) != std::end(devs))
            if (devs[cpu_queue->getDev()].state == shared)
    void sync(std::shared_ptr<KalmarQueue> pQueue, bool modify, bool block = true) {
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel())

            dev_info dev = {pQueue->getDev()->create(count, this),
                            modify ? modified : shared};
            devs[pQueue->getDev()] = dev;
            if (is_cpu_queue(pQueue))

        if (curr->getDev() == pQueue->getDev()) {

            devs[curr->getDev()].state = modified;

        if (devs.find(pQueue->getDev()) == std::end(devs)) {
            dev_info dev = {pQueue->getDev()->create(count, this), invalid};
            devs[pQueue->getDev()] = dev;
            if (is_cpu_queue(pQueue))

        dev_info& dst = devs[pQueue->getDev()];
        dev_info& src = devs[curr->getDev()];
        if (dst.state == invalid && src.state != invalid)
            copy_helper(curr, src.data, pQueue, dst.data, count, block);

        dst.state = modified;

        if (src.state == modified)
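    // Illustrative note (not part of the original header): sync() behaves like a small
    // MSI-style protocol over the per-device dev_info entries: a device whose copy is
    // 'invalid' is refreshed from the current copy via copy_helper(), after which its
    // state becomes 'modified' or 'shared' depending on whether the caller intends to write.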
    void* map(size_t cnt, size_t offset, bool modify) {

            curr = getContext()->auto_select();
            devs[curr->getDev()] = {curr->getDev()->create(count, this), modify ? modified : shared};
            return curr->map(data, cnt, offset, modify);

        dev_info& info = devs[curr->getDev()];
        if (info.state == shared && modify) {
            info.state = modified;
        }
        return curr->map(info.data, cnt, offset, modify);
    }
    void unmap(void* addr, size_t cnt, size_t offset, bool modify) {
        curr->unmap(devs[curr->getDev()].data, addr, cnt, offset, modify);
    }

    void synchronize(bool modify) { sync(master, modify); }

    void get_cpu_access(bool modify) { sync(get_cpu_queue(), modify); }
    void write(const void* src, int cnt, int offset, bool blocking) {
        curr->write(devs[curr->getDev()].data, src, cnt, offset, blocking);
        dev_info& dev = devs[curr->getDev()];
        if (dev.state != modified) {

            dev.state = modified;
    void read(void* dst, int cnt, int offset) {
        curr->read(devs[curr->getDev()].data, dst, cnt, offset);
    }
    void copy(rw_info* other, int src_offset, int dst_offset, int cnt) {

            construct(other->curr);

            other->construct(curr);

        dev_info& dst = other->devs[other->curr->getDev()];
        dev_info& src = devs[curr->getDev()];

        if (src.state == invalid) {

            if (is_cpu_queue(curr))
                memset((char*)src.data + src_offset, 0, cnt);

            void *ptr = kalmar_aligned_alloc(0x1000, cnt);

            curr->write(src.data, ptr, cnt, src_offset, true);
            kalmar_aligned_free(ptr);

        copy_helper(curr, src.data, other->curr, dst.data, cnt, true, src_offset, dst_offset);

        dst.state = modified;
#if __KALMAR_ACCELERATOR__ == 2 || __KALMAR_CPU__ == 2
        if (CLAMP::in_cpu_kernel()) {
            if (data && !HostPtr)
                kalmar_aligned_free(data);

        auto cpu_dev = get_cpu_queue()->getDev();
        if (devs.find(cpu_dev) != std::end(devs)) {

            cpu_dev->release(devs[cpu_dev].data, this);

        for (const auto it : devs) {
            std::tie(pDev, info) = it;
            if (toReleaseDevPointer)
                pDev->release(info.data, this);
inline void KalmarAsyncOp::setSeqNumFromQueue() { seqNum = queue->assign_op_seq_num(); }