Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
cuda_internal.h
Go to the documentation of this file.
1/*
2 * Copyright 2026 Stanford University, NVIDIA Corporation, Los Alamos National Laboratory
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CUDA_INTERNAL_H
19#define REALM_CUDA_INTERNAL_H
20
22
23#include <memory>
24#include <unordered_map>
25#if !defined(CUDA_ENABLE_DEPRECATED)
26// Ignore deprecation warnings from cuda headers
27#define CUDA_ENABLE_DEPRECATED 1
28#endif
29#include <cuda.h>
30#include <nvml.h>
31#include <cupti.h>
32#if defined(REALM_USE_CUDART_HIJACK)
33#include <cuda_runtime_api.h> // For cudaDeviceProp
34#endif
35
36// For CUDA runtime's dim3 definition
37#include <vector_types.h>
38
39#include "realm/operation.h"
40#include "realm/threads.h"
41#include "realm/circ_queue.h"
42#include "realm/indexspace.h"
43#include "realm/proc_impl.h"
44#include "realm/mem_impl.h"
45#include "realm/bgwork.h"
49
#if CUDART_VERSION < 11000
// Check a CUDA *runtime* API result code. Before CUDA 11.0, runtime error
// codes did not correspond 1:1 with driver (CUresult) codes, so only the raw
// integer value can be reported before terminating the process.
#define CHECK_CUDART(cmd) \
  do { \
    int ret = (int)(cmd); \
    if(ret != 0) { \
      fprintf(stderr, "CUDART: %s = %d\n", #cmd, ret); \
      assert(0); /* trap in debug builds */ \
      exit(1);   /* still terminate when NDEBUG disables assert */ \
    } \
  } while(0)
#else
// Since CUDA TK11.0, runtime and driver error codes are 1:1 correlated
#define CHECK_CUDART(cmd) CHECK_CU((CUresult)(cmd))
#endif
64
// Need CUDA 6.5 or later for good error reporting
#if CUDA_VERSION >= 6050
// Log a CUDA driver API failure through the Realm GPU logger, resolving the
// CUresult to its symbolic name and human-readable description via
// cuGetErrorName/cuGetErrorString (both available since CUDA 6.5).
#define REPORT_CU_ERROR(level, cmd, ret) \
  do { \
    const char *name, *str; \
    CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorName)(ret, &name); \
    CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorString)(ret, &str); \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret \
                          << '(' << name << "): " << str; \
  } while(0)
#else
// Pre-6.5 fallback: only the numeric error code can be reported.
// (Fix: the streaming statement was missing its terminating ';', which made
// this branch fail to compile at every expansion site.)
#define REPORT_CU_ERROR(level, cmd, ret) \
  do { \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
#endif
81
// Evaluate a CUDA driver API call; on any result other than CUDA_SUCCESS,
// log it at LEVEL_ERROR via REPORT_CU_ERROR and abort the process.
#define CHECK_CU(cmd) \
  do { \
    CUresult ret = (cmd); \
    if(ret != CUDA_SUCCESS) { \
      REPORT_CU_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
      abort(); \
    } \
  } while(0)
90
// Log an NVML API failure through the Realm GPU logger (numeric code only;
// NVML error strings are not resolved here).
#define REPORT_NVML_ERROR(level, cmd, ret) \
  do { \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
95
// Evaluate an NVML call; on any result other than NVML_SUCCESS, log it at
// LEVEL_ERROR via REPORT_NVML_ERROR and abort the process.
#define CHECK_NVML(cmd) \
  do { \
    nvmlReturn_t ret = (cmd); \
    if(ret != NVML_SUCCESS) { \
      REPORT_NVML_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
      abort(); \
    } \
  } while(0)
104
// True if 'stream' is one of CUDA's implicit default streams: the null
// stream handle (0), the legacy default stream, or the per-thread default
// stream sentinel.
#define IS_DEFAULT_STREAM(stream) \
  (((stream) == 0) || ((stream) == CU_STREAM_LEGACY) || \
   ((stream) == CU_STREAM_PER_THREAD))
108
// Log a CUPTI API failure through the Realm GPU logger (numeric code only).
#define REPORT_CUPTI_ERROR(level, cmd, ret) \
  do { \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
113
// Evaluate a CUPTI call; on any result other than CUPTI_SUCCESS, log it at
// LEVEL_ERROR via REPORT_CUPTI_ERROR and abort the process.
#define CHECK_CUPTI(cmd) \
  do { \
    CUptiResult ret = (cmd); \
    if(ret != CUPTI_SUCCESS) { \
      REPORT_CUPTI_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
      abort(); \
    } \
  } while(0)
122
123namespace Realm {
124
125 namespace Cuda {
126
    // Properties discovered for a single CUDA-capable device.
    // NOTE(review): this listing has extraction gaps -- several members are
    // missing below; consult the original header before relying on layout.
    struct GPUInfo {
      int index; // index used by CUDA runtime
      CUdevice device;       // CUDA driver device handle
      nvmlDevice_t nvml_dev; // matching NVML handle for the same device
      CUuuid uuid;           // device UUID
      int major;             // compute capability major version -- TODO confirm
      int minor;             // compute capability minor version -- TODO confirm
      static const size_t MAX_NAME_LEN = 256;
      static const size_t MAX_NUMA_NODE_LEN = 20;
      std::set<CUdevice> peers; // other GPUs we can do p2p copies with
      size_t c2c_bandwidth = 0; // Current enabled c2c bandwidth
      size_t pci_bandwidth = 0; // Current enabled pci-e bandwidth
      size_t nvswitch_bandwidth = 0; // Current enabled nvswitch bandwidth
      // presumably: host and device share a unified VA range -- verify against
      // the code that sets this
      bool host_gpu_same_va = false;
      // per-peer bandwidth/latency estimates; NOTE(review): indexing scheme
      // not visible here -- confirm against the initialization code
      std::vector<size_t> logical_peer_bandwidth;
      std::vector<size_t> logical_peer_latency;
      // Fabric information for this gpu
      bool fabric_supported = false;
      unsigned fabric_clique = -1U; // -1U == no fabric clique assigned
      CUuuid fabric_uuid = {0};

#ifdef REALM_USE_CUDART_HIJACK
      cudaDeviceProp prop; // cached runtime device properties (hijack mode only)
#endif
    };
160
168
    // Forward declarations
170 class GPUProcessor;
171 class GPUWorker;
172 class GPUStream;
173 class GPUFBMemory;
174 class GPUDynamicFBMemory;
175 class GPUZCMemory;
176 class GPUFBIBMemory;
177 class GPUAllocation;
178 class GPU;
179 class CudaModule;
180
181 extern CudaModule *cuda_module_singleton;
182
184 public:
186 void *create_context(Task *task) const override;
187 void destroy_context(Task *task, void *context) const override;
188 void *create_context(InternalTask *task) const override;
189 void destroy_context(InternalTask *task, void *context) const override;
190 GPU *gpu = nullptr;
191 GPUProcessor *proc = nullptr; // TODO(cperry): delete me
192 };
193
194 // an interface for receiving completion notification for a GPU operation
195 // (right now, just copies)
197 public:
199
200 virtual void request_completed(void) = 0;
201 };
202
204 public:
207
208 virtual void mark_finished(bool successful);
209
210 virtual void request_cancellation(void);
211
213
214 virtual void print(std::ostream &os) const;
215
219 DummyLock>
221
222 protected:
223 static void cuda_callback(CUstream stream, CUresult res, void *data);
224 GPU *gpu = nullptr;
225 };
226
228 public:
230
231 virtual void request_cancellation(void) { return; };
232
234
235 virtual void print(std::ostream &os) const;
236
238
239 protected:
240 static void cuda_start_callback(CUstream stream, CUresult res, void *data);
241 };
242
    // a class that represents a CUDA stream and work associated with
    // it (e.g. queued copies, events in flight)
    // a stream is also associated with a GPUWorker that it will register
    // with when async work needs doing
    // NOTE(review): this listing has extraction gaps -- several declarations
    // (return types, members) are truncated or missing below.
    class GPUStream {
    public:
      // NOTE(review): rel_priority is presumably relative to the device's
      // default stream priority -- confirm against the constructor definition
      GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0);

      GPU *get_gpu(void) const;
      get_stream(void) const; // needed by librealm_kokkos.so

      // may be called by anybody to enqueue a copy or an event
      // add_event: records 'event' with an optional fence / completion
      // notification / start marker to be triggered when it is reaped
      void add_event(CUevent event, GPUWorkFence *fence,
                     GPUCompletionNotification *notification = NULL,
                     GPUWorkStart *start = NULL);
      void wait_on_streams(const std::set<GPUStream *> &other_streams);

      // atomically checks rate limit counters and returns true if 'bytes'
      // worth of copies can be submitted or false if not (in which case
      // the progress counter on the xd will be updated when it should try
      // again)
      bool ok_to_submit_copy(size_t bytes, XferDes *xd);
      // NOTE(review): return-value semantics (work remaining?) not visible
      // here -- confirm against the .cc implementation
      bool reap_events(TimeLimit work_until);

    protected:
      // may only be tested with lock held
      bool has_work(void) const;

      CUstream stream; // the underlying CUDA stream handle

#ifdef USE_CQ
#else
      std::deque<PendingEvent> pending_events;
#endif
    };
294
295 // a GPUWorker is responsible for making progress on one or more GPUStreams -
296 // this may be done directly by a GPUProcessor or in a background thread
297 // spawned for the purpose
299 public:
301 virtual ~GPUWorker(void);
302
303 // adds a stream that has work to be done
305
306 // used to start a dedicate thread (mutually exclusive with being
307 // registered with a background work manager)
310
311 bool do_work(TimeLimit work_until);
312
313 public:
314 void thread_main(void);
315
316 protected:
317 // used by the background thread
318 // processes work on streams, optionally sleeping for work to show up
319 // returns true if work remains to be done
320 bool process_streams(bool sleep_on_empty);
321
324
327
328 // used by the background thread (if any)
333 };
334
335 // a little helper class to manage a pool of CUevents that can be reused
336 // to reduce alloc/destroy overheads
338 public:
339 GPUEventPool(int _batch_size = 256);
340
341 // allocating the initial batch of events and cleaning up are done with
342 // these methods instead of constructor/destructor because we don't
343 // manage the GPU context in this helper class
344 void init_pool(int init_size = 0 /* default == batch size */);
345 void empty_pool(void);
346
347 CUevent get_event(bool external = false);
348 void return_event(CUevent e, bool external = false);
349
350 protected:
353 std::vector<CUevent> available_events;
354 };
355
356 // when the runtime hijack is not enabled/active, a cuCtxSynchronize
357 // is required to ensure a task's completion event covers all of its
358 // actions - rather than blocking an important thread, we create a
359 // small thread pool to handle these
384
385 struct FatBin;
386 struct RegisteredVariable;
387 struct RegisteredFunction;
388
389 // a GPU object represents our use of a given CUDA-capable GPU - this will
390 // have an associated CUDA context, a (possibly shared) worker thread, a
391 // processor, and an FB memory (the ZC memory is shared across all GPUs)
392 class GPU {
393 public:
394 GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context);
395 ~GPU(void);
396
397 void push_context(void);
398 void pop_context(void);
399
401
402 void create_processor(RuntimeImpl *runtime, size_t stack_size);
403 void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size);
404 void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size);
405
407
408 bool can_access_peer(const GPU *peer) const;
409
410 GPUStream *find_stream(CUstream stream) const;
412 get_null_task_stream(void) const; // needed by librealm_kokkos.so
413 GPUStream *get_next_task_stream(bool create = false);
415
416 void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize,
417 size_t volume, GPUStream *stream);
418 void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize,
419 size_t volume, bool multified_optimized,
420 GPUStream *stream);
422 size_t elemSize, GPUStream *stream);
423
424 void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size,
425 size_t field_size, size_t volume,
426 GPUStream *stream);
427 bool is_accessible_host_mem(const MemoryImpl *mem) const;
428 bool is_accessible_gpu_mem(const MemoryImpl *mem) const;
429
431 ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl,
432 CUfunction fold_excl, CUfunction fold_nonexcl, CUfunction apply_excl_advanced,
433 CUfunction apply_nonexcl_advanced, CUfunction fold_excl_advanced,
434 CUfunction fold_nonexcl_advanced, CUfunction apply_excl_transpose,
435 CUfunction apply_nonexcl_transpose, CUfunction fold_excl_transpose,
436 CUfunction fold_nonexcl_transpose);
437
438 protected:
439 CUmodule load_cuda_module(const void *data);
440
441 public:
443 CudaModule *module = nullptr;
444 GPUInfo *info = nullptr;
445 GPUWorker *worker = nullptr;
446 GPUProcessor *proc = nullptr;
447
448 std::map<CUdeviceptr, GPUAllocation> allocations;
449 GPUFBMemory *fbmem = nullptr;
452
453 CUcontext context = nullptr;
454
455 CUmodule device_module = nullptr;
456
457 struct GPUFuncInfo {
458 CUfunction func;
461 };
462
463 // The maximum value of log2(type_bytes) that cuda kernels handle.
464 // log2(1 byte) --> 0
465 // log2(2 bytes) --> 1
466 // log2(4 bytes) --> 2
467 // log2(8 bytes) --> 3
468 // log2(16 bytes) --> 4
469 static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES = 5;
470
481
482 CUdeviceptr fbmem_base = 0;
483
484 CUdeviceptr fb_ibmem_base = 0;
485
486 // which system memories have been registered and can be used for cuMemcpyAsync
487 std::set<Memory> pinned_sysmems;
488
489 // managed memories we can concurrently access
490 std::set<Memory> managed_mems;
491
492 // which other FBs we have peer access to
493 std::set<Memory> peer_fbs;
494
495 // streams for different copy types and a pile for actual tasks
499 std::vector<GPUStream *> device_to_device_streams;
500 std::vector<GPUStream *> peer_to_peer_streams; // indexed by target
501 std::vector<GPUStream *> task_streams;
505
507
508 // this can technically be different in each context (but probably isn't
509 // in practice)
511
516 uintptr_t local_base;
517 uintptr_t address_offset; // add to convert from original to local base
518 };
519 std::vector<CudaIpcMapping> cudaipc_mappings;
520 std::map<NodeID, GPUStream *> cudaipc_streams;
523
525 CUfunction apply_nonexcl = nullptr;
526 CUfunction apply_excl = nullptr;
527 CUfunction fold_nonexcl = nullptr;
528 CUfunction fold_excl = nullptr;
529 CUfunction apply_nonexcl_advanced = nullptr;
530 CUfunction apply_excl_advanced = nullptr;
531 CUfunction fold_nonexcl_advanced = nullptr;
532 CUfunction fold_excl_advanced = nullptr;
533 CUfunction apply_nonexcl_transpose = nullptr;
534 CUfunction apply_excl_transpose = nullptr;
535 CUfunction fold_nonexcl_transpose = nullptr;
536 CUfunction fold_excl_transpose = nullptr;
537 };
538
539 std::unordered_map<ReductionOpID, GPUReductionOpEntry> gpu_reduction_table;
540 };
541
542 // helper to push/pop a GPU's context by scope
544 public:
548
549 protected:
551 };
552
553 class REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUProcessor // needed by librealm_kokkos.so
555 public:
556 GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me,
557 Realm::CoreReservationSet &crs, size_t _stack_size);
558 virtual ~GPUProcessor(void);
559
560 public:
561 virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc,
562 const ByteArrayRef &user_data);
563
564 virtual void shutdown(void);
565
566 protected:
568 const ByteArrayRef &task_args);
569
570 public:
572
573 protected:
575
577 Processor::TaskFuncPtr fnptr;
578 Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr;
580 };
581
582 // we're not using the parent's task table, but we can use the mutex
583 // RWLock task_table_mutex;
584 std::map<Processor::TaskFuncID, GPUTaskTableEntry> gpu_task_table;
585 };
586
587 // this can be attached to any MemoryImpl if the underlying memory is
588 // guaranteed to belong to a given CUcontext - this will allow that
589 // context's processor and dma channels to work with it
590 // the creator is expected to know what CUcontext they want but need
591 // not know which GPU object that corresponds to
593 public:
594 CudaDeviceMemoryInfo(CUcontext _context);
595
596 CUcontext context;
598 };
599
601 public:
602 GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base,
603 size_t _size);
604
605 virtual ~GPUFBMemory(void);
606
607 // these work, but they are SLOW
608 virtual void get_bytes(off_t offset, void *dst, size_t size);
609 virtual void put_bytes(off_t offset, const void *src, size_t size);
610
611 virtual void *get_direct_ptr(off_t offset, size_t size);
612
613 // GPUFBMemory supports ExternalCudaMemoryResource and
614 // ExternalCudaArrayResource
616 size_t &inst_offset);
618
619 // for re-registration purposes, generate an ExternalInstanceResource *
620 // (if possible) for a given instance, or a subset of one
623 span<const FieldID> fields, bool read_only);
624
625 public:
627 CUdeviceptr base;
629 };
630
632 public:
633 GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu,
634 size_t _max_size);
635
636 virtual ~GPUDynamicFBMemory(void);
637 void cleanup(void);
638
639 // deferred allocation not supported
641 bool need_alloc_result,
642 bool poisoned,
643 TimeLimit work_until);
644
645 virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned,
646 TimeLimit work_until);
647
648 // these work, but they are SLOW
649 virtual void get_bytes(off_t offset, void *dst, size_t size);
650 virtual void put_bytes(off_t offset, const void *src, size_t size);
651
652 virtual void *get_direct_ptr(off_t offset, size_t size);
653
654 // GPUDynamicFBMemory supports ExternalCudaMemoryResource and
655 // ExternalCudaArrayResource
657 size_t &inst_offset);
659
660 // for re-registration purposes, generate an ExternalInstanceResource *
661 // (if possible) for a given instance, or a subset of one
664 span<const FieldID> fields, bool read_only);
665
666 public:
669 size_t cur_size;
670 std::map<RegionInstance, std::pair<CUdeviceptr, size_t>> alloc_bases;
672 };
673
675 public:
676 GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base,
677 void *_cpu_base, size_t _size, MemoryKind _kind,
678 Memory::Kind _lowlevel_kind);
679
680 virtual ~GPUZCMemory(void);
681
682 virtual void get_bytes(off_t offset, void *dst, size_t size);
683
684 virtual void put_bytes(off_t offset, const void *src, size_t size);
685
686 virtual void *get_direct_ptr(off_t offset, size_t size);
687
688 // GPUZCMemory supports ExternalCudaPinnedHostResource
690 size_t &inst_offset);
692
693 // for re-registration purposes, generate an ExternalInstanceResource *
694 // (if possible) for a given instance, or a subset of one
697 span<const FieldID> fields, bool read_only);
698
699 public:
700 CUdeviceptr gpu_base;
701 char *cpu_base;
703 };
704
705 class GPUFBIBMemory : public IBMemory {
706 public:
707 GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base,
708 size_t _size);
709
710 public:
712 CUdeviceptr base;
714 };
715
716 class GPURequest;
717
719 public:
721
723 };
724
725 class GPURequest : public Request {
726 public:
727 const void *src_base;
728 void *dst_base;
729 // off_t src_gpu_off, dst_gpu_off;
732 };
733
735 public:
737 XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size,
738 int _write_port_idx, size_t _write_offset, size_t _write_size,
739 int _read_ind_port_idx = -1, size_t _read_ind_offset = 0,
740 size_t _read_ind_size = 0, int _write_ind_port_idx = -1,
741 size_t _write_ind_offset = 0, size_t _write_ind_size = 0);
742
743 virtual void request_completed(void);
744
745 protected:
755 };
756
758 public:
759 GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset,
760 size_t _read_size, int _write_port_idx, size_t _write_offset,
761 size_t _write_size);
762
763 virtual void request_completed(void);
764
765 protected:
771 };
772
774 public:
775 MemSpecificCudaArray(CUarray _array);
777
778 CUarray array;
779 };
780
782 public:
783 virtual int set_rect(const RegionInstanceImpl *inst,
784 const InstanceLayoutPieceBase *piece, size_t field_size,
785 size_t field_offset, int ndims, const int64_t lo[/*ndims*/],
786 const int64_t hi[/*ndims*/], const int order[/*ndims*/]);
787
788 CUarray array;
789 int dim;
790 size_t pos[3];
792 };
793
794 class GPUChannel;
795
796 class GPUXferDes : public XferDes {
797 public:
798 GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
799 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
800 const std::vector<XferDesPortInfo> &outputs_info, int _priority);
801
802 long get_requests(Request **requests, long nr);
803
805
806 static size_t read_address_entry(AffineCopyInfo<3> &copy_infos, size_t &min_align,
807 MemcpyTransposeInfo<size_t> &transpose_info,
808 AddressListCursor &in_alc, uintptr_t in_base,
809 AddressListCursor &out_alc, uintptr_t out_base,
810 size_t bytes_left, size_t max_xfer_fields,
811 size_t &fields_total);
812
813 private:
814 std::vector<GPU *> src_gpus, dst_gpus;
815 std::vector<bool> dst_is_ipc;
816
      // Minimum amount to transfer in a single quantum before returning in order to
      // ensure forward progress
      // TODO: make controllable
820 static constexpr size_t min_xfer_size = 4 << 20;
      // Maximum amount to transfer in a single quantum in order to ensure other requests
      // have a chance to make forward progress. This should be large enough that the
      // overhead of splitting the copy shouldn't be noticeable in terms of latency (4GiB
      // should be good here for most purposes)
      // TODO: make controllable
826 static constexpr size_t max_xfer_size = 4ULL * 1024ULL * 1024ULL * 1024ULL;
827 static constexpr size_t max_xfer_fields = 2000;
828 };
829
830 class GPUIndirectChannel;
831
833 public:
834 GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
835 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
836 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
837 XferDesRedopInfo _redop_info);
838
839 long get_requests(Request **requests, long nr);
841
842 protected:
843 std::vector<GPU *> src_gpus, dst_gpus;
844 std::vector<bool> dst_is_ipc;
845 };
846
848 : public SingleXDQChannel<GPUIndirectChannel, GPUIndirectXferDes> {
849 public:
852
853 // multi-threading of cuda copies for a given device is disabled by
854 // default (can be re-enabled with -cuda:mtdma 1)
855 static const bool is_ordered = true;
856
857 virtual bool needs_wrapping_iterator() const;
859
861
862 virtual uint64_t
863 supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id,
864 CustomSerdezID dst_serdez_id, ReductionOpID redop_id,
865 size_t total_bytes, const std::vector<size_t> *src_frags,
866 const std::vector<size_t> *dst_frags, XferDesKind *kind_ret = 0,
867 unsigned *bw_ret = 0, unsigned *lat_ret = 0);
868
869 virtual bool supports_indirection_memory(Memory mem) const;
870
871 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
872 XferDesID guid,
873 const std::vector<XferDesPortInfo> &inputs_info,
874 const std::vector<XferDesPortInfo> &outputs_info,
875 int priority, XferDesRedopInfo redop_info,
876 const void *fill_data, size_t fill_size,
877 size_t fill_total);
878
879 long submit(Request **requests, long nr);
880 GPU *get_gpu() const { return src_gpu; }
881
882 protected:
883 friend class GPUIndirectXferDes;
885 };
886
888 public:
890 uintptr_t _remote_ptr,
891 const std::vector<Channel::SupportedPath> &_paths,
892 const std::vector<Memory> &_indirect_memories);
893
895
896 template <typename S>
897 bool serialize(S &serializer) const;
898
899 template <typename S>
900 static RemoteChannelInfo *deserialize_new(S &deserializer);
901
902 protected:
906 };
907
910
911 public:
912 GPUIndirectRemoteChannel(uintptr_t _remote_ptr,
913 const std::vector<Memory> &_indirect_memories);
915 virtual bool needs_wrapping_iterator() const;
916 virtual uint64_t
917 supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id,
918 CustomSerdezID dst_serdez_id, ReductionOpID redop_id,
919 size_t total_bytes, const std::vector<size_t> *src_frags,
920 const std::vector<size_t> *dst_frags, XferDesKind *kind_ret /*= 0*/,
921 unsigned *bw_ret /*= 0*/, unsigned *lat_ret /*= 0*/);
922 };
923
924 class GPUChannel : public SingleXDQChannel<GPUChannel, GPUXferDes> {
925 public:
928
929 // multi-threading of cuda copies for a given device is disabled by
930 // default (can be re-enabled with -cuda:mtdma 1)
931 static const bool is_ordered = true;
932
933 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
934 XferDesID guid,
935 const std::vector<XferDesPortInfo> &inputs_info,
936 const std::vector<XferDesPortInfo> &outputs_info,
937 int priority, XferDesRedopInfo redop_info,
938 const void *fill_data, size_t fill_size,
939 size_t fill_total);
940
941 long submit(Request **requests, long nr);
942 GPU *get_gpu() const { return src_gpu; }
943
945
946 virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
947 {
948 return true;
949 }
950
951 private:
952 GPU *src_gpu;
953 };
954
956 public:
957 GPURemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,
958 const std::vector<Channel::SupportedPath> &_paths);
959
961
962 template <typename S>
963 bool serialize(S &serializer) const;
964
965 template <typename S>
966 static RemoteChannelInfo *deserialize_new(S &deserializer);
967
968 protected:
972 };
973
976
977 GPURemoteChannel(uintptr_t _remote_ptr);
978
979 public:
980 virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
981 {
982 return true;
983 }
984 };
985
986 class GPUfillChannel;
987
988 class GPUfillXferDes : public XferDes {
989 public:
990 GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
991 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
992 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
993 const void *_fill_data, size_t _fill_size, size_t _fill_total);
994
995 long get_requests(Request **requests, long nr);
996
998
999 protected:
1001 };
1002
1003 class GPUfillChannel : public SingleXDQChannel<GPUfillChannel, GPUfillXferDes> {
1004 public:
1006
1007 // multiple concurrent cuda fills ok
1008 static const bool is_ordered = false;
1009
1010 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
1011 XferDesID guid,
1012 const std::vector<XferDesPortInfo> &inputs_info,
1013 const std::vector<XferDesPortInfo> &outputs_info,
1014 int priority, XferDesRedopInfo redop_info,
1015 const void *fill_data, size_t fill_size,
1016 size_t fill_total);
1017
1018 long submit(Request **requests, long nr);
1019
1020 protected:
1021 friend class GPUfillXferDes;
1022
1024 };
1025
1026 class GPUreduceChannel;
1027
1032
1033 class GPUreduceXferDes : public XferDes {
1034 public:
1035 GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
1036 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
1037 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
1038 XferDesRedopInfo _redop_info);
1039
1040 long get_requests(Request **requests, long nr);
1041
1044 XferPort *in_port, XferPort *out_port,
1045 const size_t in_span_start,
1046 const size_t out_span_start);
1047
1049 const size_t in_span_start, const size_t out_span_start,
1050 const size_t in_elem_size, const size_t out_elem_size,
1051 const size_t elems, const bool has_transpose);
1053
1055 bool resolve_kernel_slot(GPU *gpu, void *host_proxy, CUfunction &kernel_out,
1056 CUfunction GPU::GPUReductionOpEntry::*cache_field);
1057
1058 protected:
1061 CUfunction kernel;
1068 std::vector<GPU *> src_gpus;
1069 std::vector<bool> src_is_ipc;
1070 };
1071
1072 class GPUreduceChannel : public SingleXDQChannel<GPUreduceChannel, GPUreduceXferDes> {
1073 public:
1075
1076 // multiple concurrent cuda reduces ok
1077 static const bool is_ordered = false;
1078
1079 // helper method here so that GPUreduceRemoteChannel can use it too
1080 bool supports_redop(ReductionOpID redop_id) const override;
1081
1083
1084 XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid,
1085 const std::vector<XferDesPortInfo> &inputs_info,
1086 const std::vector<XferDesPortInfo> &outputs_info,
1087 int priority, XferDesRedopInfo redop_info,
1088 const void *fill_data, size_t fill_size,
1089 size_t fill_total) override;
1090
1091 long submit(Request **requests, long nr) override;
1092
1093 protected:
1094 friend class GPUreduceXferDes;
1095
1097 };
1098
1100 public:
1101 GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,
1102 const std::vector<Channel::SupportedPath> &_paths);
1103
1105
1106 template <typename S>
1107 bool serialize(S &serializer) const;
1108
1109 template <typename S>
1110 static RemoteChannelInfo *deserialize_new(S &deserializer);
1111
1112 protected:
1116 };
1117
1120
1121 GPUreduceRemoteChannel(uintptr_t _remote_ptr);
1122 };
1123
1124 // active message for establishing cuda ipc mappings
1126 unsigned count = 0;
1127#if !defined(REALM_IS_WINDOWS)
1128 long hostid = 0;
1129#endif
1130 static void handle_message(NodeID sender, const CudaIpcImportRequest &args,
1131 const void *data, size_t datalen);
1132 };
1133
1135 public:
1137
1138 virtual void chunk_created(void *base, size_t bytes);
1139 virtual void chunk_destroyed(void *base, size_t bytes);
1140
1141 protected:
1142 CudaModule *module;
1143 };
1144
1149 public:
1150 // -- Constructors --
1151 GPUAllocation(void) = default;
1152 GPUAllocation(GPUAllocation &&other) noexcept;
1153 GPUAllocation(const GPUAllocation &) = delete;
1155 GPUAllocation &operator=(const GPUAllocation &) = delete;
1157
1158 // --- Accessors ---
1159 inline operator bool(void) const { return dev_ptr != 0; }
1166
1171 inline bool get_ipc_handle(CUipcMemHandle &handle) const
1172 {
1173 if(has_ipc_handle) {
1174 handle = ipc_handle;
1175 }
1176 return has_ipc_handle;
1177 }
1178#if CUDA_VERSION >= 12030
1183 bool get_fabric_handle(CUmemFabricHandle &handle) const;
1184#endif
1193 inline CUdeviceptr get_dptr(void) const { return dev_ptr; }
1196 inline GPU *get_gpu(void) const { return gpu; }
1199 inline size_t get_size(void) const { return size; }
1200
1206 template <typename T = void>
1207 T *get_hptr(void) const
1208 {
1209 return static_cast<T *>(host_ptr);
1210 }
1211
1218
1219 // -- Allocators --
1220
1230 static GPUAllocation *allocate_dev(GPU *gpu, size_t size, bool peer_enabled = true,
1231 bool shareable = true);
1232#if CUDA_VERSION >= 11000
1246 static GPUAllocation *allocate_mmap(GPU *gpu, const CUmemAllocationProp &prop,
1247 size_t size, CUdeviceptr vaddr = 0,
1248 bool peer_enabled = true);
1249#endif
1261 static GPUAllocation *allocate_host(GPU *gpu, size_t size, bool peer_enabled = true,
1262 bool shareable = true, bool same_va = true);
1269 static GPUAllocation *allocate_managed(GPU *gpu, size_t size);
1281 static GPUAllocation *register_allocation(GPU *gpu, void *ptr, size_t size,
1282 bool peer_enabled = true);
1288 static GPUAllocation *open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl);
1297 static GPUAllocation *open_handle(GPU *gpu, OsHandle hdl, size_t size,
1298 bool peer_enabled = true);
1299#if CUDA_VERSION >= 12030
1310 static GPUAllocation *open_fabric(GPU *gpu, const CUmemFabricHandle &hdl,
1311 size_t size, bool peer_enabled = true,
1312 bool is_local = false);
1313#endif
1314
1315 private:
1316 CUresult map_allocation(GPU *gpu, CUmemGenericAllocationHandle handle, size_t size,
1317 CUdeviceptr va = 0, size_t offset = 0,
1318 bool peer_enabled = false, bool map_host = false);
1319
1320#if CUDA_VERSION >= 11000
1326 static size_t align_size(const CUmemAllocationProp &prop, size_t size);
1327#endif
1328 // -- Deleters --
1329 typedef void (*DeleterCallback)(GPUAllocation &alloc);
1330
1331 // These are helper functions to manage what freeing strategy needs to be used to
1332 // properly free the allocation
1333 static void cuda_malloc_free(GPUAllocation &alloc);
1334 static void cuda_malloc_host_free(GPUAllocation &alloc);
1335 static void cuda_register_free(GPUAllocation &alloc);
1336 static void cuda_ipc_free(GPUAllocation &alloc);
1337#if CUDA_VERSION >= 11000
1338 static void cuda_memmap_free(GPUAllocation &alloc);
1339#endif
1340
1341 // -- Members --
1343 GPU *gpu = nullptr;
1345 CUdeviceptr dev_ptr = 0;
1347 void *host_ptr = nullptr;
1349 size_t size = 0;
1351 DeleterCallback deleter = nullptr;
1352#if CUDA_VERSION >= 11000
1354 CUmemGenericAllocationHandle mmap_handle = 0;
1355 // True if VA needs to be released for cuMemMap'ed memory
1356 // or if the registered memory actually needs to be unregistered
1357 bool owns_va = true;
1358#endif
1360 bool has_ipc_handle = false;
1362 CUipcMemHandle ipc_handle;
1363 };
1364
1365 // Define these APIs locally here if we know the definition isn't in cuda.h. This
1366 // allows us to use this driver function even if it is unavailable to our current
1367 // toolkit
1368
1369#if CUDA_VERSION < 11030
1370#define CU_GET_PROC_ADDRESS_DEFAULT 0
1371 CUresult cuGetProcAddress(const char *, void **, int, int);
1372#endif
1373
1374#if CUDA_VERSION < 12050
1375 CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event);
1376#endif
1377
1378#if CUDA_VERSION >= 13000
// Unfortunately, 13.0 violates its own source-compatibility rules versus
// cuGetProcAddress, so fix that ourselves here.
1381#if !defined(cuCtxGetDevice)
1382#define cuCtxGetDevice cuCtxGetDevice_v2
1383#endif
1384#if !defined(cuCtxSynchronize)
1385#define cuCtxSynchronize cuCtxSynchronize_v2
1386#endif
1387#if !defined(cuStreamGetCtx)
1388#define cuStreamGetCtx cuStreamGetCtx_v2
1389#endif
1390#endif
1391
1392 // cuda driver and/or runtime entry points
1393#define CUDA_DRIVER_HAS_FNPTR(name) ((name##_fnptr) != nullptr)
1394#define CUDA_DRIVER_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
1395
1396// Only APIs that are available in the minimum base driver version that Realm supports
1397// should be listed here
1398
1399// Note: it is imperative for APIs introduced in minor versions after
1400// the minimum version defined above to explicitly denote the version they were
1401// introduced, otherwise it is possible to retrieve the wrong API and crash when called.
1402
1403// The minimum base driver version Realm supports
1404#define CUDA_VERSION_MIN 11080
1405// Source compatible version of cuda.h (the minimum version where the decltype(&fn)
1406// matches the function returned from cuGetProcAddress(fn, CUDA_VERSION_COMPAT) )
1407#define CUDA_VERSION_COMPAT ((CUDA_VERSION / 1000) * 1000)
1408
1409#define CUDA_DRIVER_APIS(__op__) \
1410 __op__(cuModuleGetFunction, CUDA_VERSION_MIN); \
1411 __op__(cuCtxGetDevice, CUDA_VERSION_MIN); \
1412 __op__(cuCtxEnablePeerAccess, CUDA_VERSION_MIN); \
1413 __op__(cuCtxGetFlags, CUDA_VERSION_MIN); \
1414 __op__(cuCtxGetStreamPriorityRange, CUDA_VERSION_MIN); \
1415 __op__(cuCtxPopCurrent, CUDA_VERSION_MIN); \
1416 __op__(cuCtxPushCurrent, CUDA_VERSION_MIN); \
1417 __op__(cuCtxSynchronize, CUDA_VERSION_MIN); \
1418 __op__(cuDeviceCanAccessPeer, CUDA_VERSION_MIN); \
1419 __op__(cuDeviceGet, CUDA_VERSION_MIN); \
1420 __op__(cuDeviceGetUuid, CUDA_VERSION_MIN); \
1421 __op__(cuDeviceGetAttribute, CUDA_VERSION_MIN); \
1422 __op__(cuDeviceGetCount, CUDA_VERSION_MIN); \
1423 __op__(cuDeviceGetName, CUDA_VERSION_MIN); \
1424 __op__(cuDevicePrimaryCtxRelease, CUDA_VERSION_MIN); \
1425 __op__(cuDevicePrimaryCtxRetain, CUDA_VERSION_MIN); \
1426 __op__(cuDevicePrimaryCtxSetFlags, CUDA_VERSION_MIN); \
1427 __op__(cuDeviceTotalMem, CUDA_VERSION_MIN); \
1428 __op__(cuEventCreate, CUDA_VERSION_MIN); \
1429 __op__(cuEventDestroy, CUDA_VERSION_MIN); \
1430 __op__(cuEventQuery, CUDA_VERSION_MIN); \
1431 __op__(cuEventRecord, CUDA_VERSION_MIN); \
1432 __op__(cuGetErrorName, CUDA_VERSION_MIN); \
1433 __op__(cuGetErrorString, CUDA_VERSION_MIN); \
1434 __op__(cuInit, CUDA_VERSION_MIN); \
1435 __op__(cuIpcCloseMemHandle, CUDA_VERSION_MIN); \
1436 __op__(cuIpcGetMemHandle, CUDA_VERSION_MIN); \
1437 __op__(cuIpcOpenMemHandle, CUDA_VERSION_MIN); \
1438 __op__(cuLaunchKernel, CUDA_VERSION_MIN); \
1439 __op__(cuMemAllocManaged, CUDA_VERSION_MIN); \
1440 __op__(cuMemAlloc, CUDA_VERSION_MIN); \
1441 __op__(cuMemcpy2DAsync, CUDA_VERSION_MIN); \
1442 __op__(cuMemcpy3DAsync, CUDA_VERSION_MIN); \
1443 __op__(cuMemcpyAsync, CUDA_VERSION_MIN); \
1444 __op__(cuMemcpyDtoDAsync, CUDA_VERSION_MIN); \
1445 __op__(cuMemcpyDtoH, CUDA_VERSION_MIN); \
1446 __op__(cuMemcpyDtoHAsync, CUDA_VERSION_MIN); \
1447 __op__(cuMemcpyHtoD, CUDA_VERSION_MIN); \
1448 __op__(cuMemcpyHtoDAsync, CUDA_VERSION_MIN); \
1449 __op__(cuMemFreeHost, CUDA_VERSION_MIN); \
1450 __op__(cuMemFree, CUDA_VERSION_MIN); \
1451 __op__(cuMemGetInfo, CUDA_VERSION_MIN); \
1452 __op__(cuMemHostAlloc, CUDA_VERSION_MIN); \
1453 __op__(cuMemHostGetDevicePointer, CUDA_VERSION_MIN); \
1454 __op__(cuMemHostRegister, CUDA_VERSION_MIN); \
1455 __op__(cuMemHostUnregister, CUDA_VERSION_MIN); \
1456 __op__(cuMemsetD16Async, CUDA_VERSION_MIN); \
1457 __op__(cuMemsetD2D16Async, CUDA_VERSION_MIN); \
1458 __op__(cuMemsetD2D32Async, CUDA_VERSION_MIN); \
1459 __op__(cuMemsetD2D8Async, CUDA_VERSION_MIN); \
1460 __op__(cuMemsetD32Async, CUDA_VERSION_MIN); \
1461 __op__(cuMemsetD8Async, CUDA_VERSION_MIN); \
1462 __op__(cuModuleLoadDataEx, CUDA_VERSION_MIN); \
1463 __op__(cuStreamAddCallback, CUDA_VERSION_MIN); \
1464 __op__(cuStreamCreate, CUDA_VERSION_MIN); \
1465 __op__(cuStreamCreateWithPriority, CUDA_VERSION_MIN); \
1466 __op__(cuStreamDestroy, CUDA_VERSION_MIN); \
1467 __op__(cuStreamSynchronize, CUDA_VERSION_MIN); \
1468 __op__(cuOccupancyMaxPotentialBlockSize, CUDA_VERSION_MIN); \
1469 __op__(cuOccupancyMaxPotentialBlockSizeWithFlags, CUDA_VERSION_MIN); \
1470 __op__(cuEventSynchronize, CUDA_VERSION_MIN); \
1471 __op__(cuEventElapsedTime, CUDA_VERSION_MIN); \
1472 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessor, CUDA_VERSION_MIN); \
1473 __op__(cuMemAddressReserve, CUDA_VERSION_MIN); \
1474 __op__(cuMemAddressFree, CUDA_VERSION_MIN); \
1475 __op__(cuMemCreate, CUDA_VERSION_MIN); \
1476 __op__(cuMemRelease, CUDA_VERSION_MIN); \
1477 __op__(cuMemMap, CUDA_VERSION_MIN); \
1478 __op__(cuMemUnmap, CUDA_VERSION_MIN); \
1479 __op__(cuMemSetAccess, CUDA_VERSION_MIN); \
1480 __op__(cuMemGetAllocationGranularity, CUDA_VERSION_MIN); \
1481 __op__(cuMemGetAllocationPropertiesFromHandle, CUDA_VERSION_MIN); \
1482 __op__(cuMemExportToShareableHandle, CUDA_VERSION_MIN); \
1483 __op__(cuMemImportFromShareableHandle, CUDA_VERSION_MIN); \
1484 __op__(cuStreamWaitEvent, CUDA_VERSION_MIN); \
1485 __op__(cuStreamQuery, CUDA_VERSION_MIN); \
1486 __op__(cuMemGetAddressRange, CUDA_VERSION_MIN); \
1487 __op__(cuPointerGetAttributes, CUDA_VERSION_MIN); \
1488 __op__(cuDriverGetVersion, CUDA_VERSION_MIN); \
1489 __op__(cuMemAdvise, CUDA_VERSION_MIN); \
1490 __op__(cuMemPrefetchAsync, CUDA_VERSION_MIN); \
1491 __op__(cuCtxSetSharedMemConfig, CUDA_VERSION_MIN); \
1492 __op__(cuCtxSetCacheConfig, CUDA_VERSION_MIN); \
1493 __op__(cuCtxSetLimit, CUDA_VERSION_MIN); \
1494 __op__(cuCtxGetLimit, CUDA_VERSION_MIN); \
1495 __op__(cuFuncSetAttribute, CUDA_VERSION_MIN); \
1496 __op__(cuFuncSetCacheConfig, CUDA_VERSION_MIN); \
1497 __op__(cuFuncSetSharedMemConfig, CUDA_VERSION_MIN); \
1498 __op__(cuFuncGetAttribute, CUDA_VERSION_MIN); \
1499 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, CUDA_VERSION_MIN); \
1500 __op__(cuArray3DCreate, CUDA_VERSION_MIN); \
1501 __op__(cuArrayDestroy, CUDA_VERSION_MIN); \
1502 __op__(cuSurfObjectCreate, CUDA_VERSION_MIN); \
1503 __op__(cuSurfObjectDestroy, CUDA_VERSION_MIN); \
1504 __op__(cuLaunchCooperativeKernel, CUDA_VERSION_MIN); \
1505 __op__(cuModuleGetGlobal, CUDA_VERSION_MIN); \
1506 __op__(cuLaunchHostFunc, CUDA_VERSION_MIN); \
1507 __op__(cuCtxRecordEvent, 12050); \
1508 __op__(cuArrayGetMemoryRequirements, CUDA_VERSION_MIN);
1509
1510// Make sure to only use decltype, to ensure it matches the cuda.h definition
1511#define DECL_FNPTR_EXTERN(name, ver) extern decltype(&name) name##_fnptr;
1513#undef DECL_FNPTR_EXTERN
1514
1515#define NVML_FNPTR(name) (name##_fnptr)
1516
1517#if NVML_API_VERSION >= 11
1518#define NVML_11_APIS(__op__) __op__(nvmlDeviceGetMemoryAffinity);
1519#else
1520#define NVML_11_APIS(__op__)
1521#endif
1522
1523#if NVML_API_VERSION >= 12
1524#define NVML_12_APIS(__op__) __op__(nvmlDeviceGetGpuFabricInfo)
1525#else
1526#define NVML_12_APIS(__op__)
1527#endif
1528
1529#if CUDA_VERSION < 11040
1530 // Define an NVML api that doesn't exist prior to CUDA Toolkit 11.5, but should
1531 // exist in systems that require it that we need to support (we'll detect its
1532 // availability later)
1533 //
1534 // Although these are NVML apis, NVML_API_VERSION doesn't support any way to detect
1535 // minor versioning, so we'll use the cuda header's versioning here, which should
1536 // coincide with the versions we're looking for
1544
1545 nvmlReturn_t
1546 nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link,
1547 nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType);
1548#endif
1549
1550#define NVML_APIS(__op__) \
1551 __op__(nvmlInit); \
1552 __op__(nvmlDeviceGetHandleByUUID); \
1553 __op__(nvmlDeviceGetMaxPcieLinkWidth); \
1554 __op__(nvmlDeviceGetMaxPcieLinkGeneration); \
1555 __op__(nvmlDeviceGetNvLinkState); \
1556 __op__(nvmlDeviceGetNvLinkVersion); \
1557 __op__(nvmlDeviceGetNvLinkRemotePciInfo); \
1558 __op__(nvmlDeviceGetNvLinkRemoteDeviceType); \
1559 __op__(nvmlDeviceGetDeviceHandleFromMigDeviceHandle); \
1560 __op__(nvmlDeviceGetFieldValues); \
1561 NVML_11_APIS(__op__); \
1562 NVML_12_APIS(__op__);
1563
1564#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1566#undef DECL_FNPTR_EXTERN
1567
1568#define CUPTI_APIS(__op__) \
1569 __op__(cuptiActivityRegisterCallbacks); \
1570 __op__(cuptiActivityEnable); \
1571 __op__(cuptiActivityDisable); \
1572 __op__(cuptiActivityEnableContext); \
1573 __op__(cuptiActivityDisableContext); \
1574 __op__(cuptiActivityFlushAll); \
1575 __op__(cuptiActivityGetNextRecord); \
1576 __op__(cuptiActivityRegisterTimestampCallback); \
1577 __op__(cuptiActivityPushExternalCorrelationId); \
1578 __op__(cuptiActivityPopExternalCorrelationId);
1579
1580#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1582#undef DECL_FNPTR_EXTERN
1583
1584#define CUPTI_HAS_FNPTR(name) (name##_fnptr != nullptr)
1585#define CUPTI_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
1586
1587 }; // namespace Cuda
1588
1589}; // namespace Realm
1590
1591#endif
bootstrap_handle_t * handle
Definition bootstrap.h:61
Definition address_list.h:100
Definition bgwork.h:129
Definition bgwork.h:36
Definition bytearray.h:30
Definition bytearray.h:53
Definition channel.h:713
Definition circ_queue.h:35
Definition codedesc.h:249
Definition threads.h:382
Definition threads.h:342
Definition cuda_internal.h:781
int dim
Definition cuda_internal.h:789
CUarray array
Definition cuda_internal.h:788
size_t width_in_bytes
Definition cuda_internal.h:791
size_t height
Definition cuda_internal.h:791
size_t pos[3]
Definition cuda_internal.h:790
size_t depth
Definition cuda_internal.h:791
virtual int set_rect(const RegionInstanceImpl *inst, const InstanceLayoutPieceBase *piece, size_t field_size, size_t field_offset, int ndims, const int64_t lo[], const int64_t hi[], const int order[])
Definition cuda_internal.h:543
GPU * gpu
Definition cuda_internal.h:550
Definition cuda_internal.h:360
Mutex mutex
Definition cuda_internal.h:376
std::vector< Thread * > worker_threads
Definition cuda_internal.h:381
int total_threads
Definition cuda_internal.h:380
CoreReservation * core_rsrv
Definition cuda_internal.h:382
int max_threads
Definition cuda_internal.h:375
void add_fence(GPUWorkFence *fence)
ContextSynchronizer(GPU *_gpu, CUcontext _context, CoreReservationSet &crs, int _max_threads)
GPU * gpu
Definition cuda_internal.h:373
int syncing_threads
Definition cuda_internal.h:380
Mutex::CondVar condvar
Definition cuda_internal.h:377
CUcontext context
Definition cuda_internal.h:374
int sleeping_threads
Definition cuda_internal.h:380
bool shutdown_flag
Definition cuda_internal.h:378
GPUWorkFence::FenceList fences
Definition cuda_internal.h:379
Definition cuda_internal.h:592
CudaDeviceMemoryInfo(CUcontext _context)
GPU * gpu
Definition cuda_internal.h:597
CUcontext context
Definition cuda_internal.h:596
Definition cuda_module.h:181
Class for managing the lifetime of a given gpu allocation. As instances of this class own an underlyi...
Definition cuda_internal.h:1148
static GPUAllocation * open_handle(GPU *gpu, OsHandle hdl, size_t size, bool peer_enabled=true)
Retrieves the GPUAllocation given the OsHandle.
static GPUAllocation * register_allocation(GPU *gpu, void *ptr, size_t size, bool peer_enabled=true)
Create an allocation that registers the given CPU address range with CUDA, making it accessible from ...
static GPUAllocation * allocate_host(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true, bool same_va=true)
Allocate CPU-located memory for the given gpu with the given size and features.
T * get_hptr(void) const
Retrieves the CPU accessible base address for the allocation, or nullptr if there is no way to access...
Definition cuda_internal.h:1207
static GPUAllocation * allocate_managed(GPU *gpu, size_t size)
Allocate migratable memory that can be used with CUDA's managed memory APIs (cuMemPrefetchAsync,...
GPUAllocation(const GPUAllocation &)=delete
size_t get_size(void) const
Retrieves the given size of the allocation.
Definition cuda_internal.h:1199
OsHandle get_os_handle(void) const
Accessor for the file descriptor or win32 HANDLE associated with the allocation. This handle can be s...
bool get_ipc_handle(CUipcMemHandle &handle) const
Retrieves the CUipcMemHandle for this allocation that can be used with GPUAllocation::open_ipc.
Definition cuda_internal.h:1171
static GPUAllocation * allocate_dev(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true)
Allocates device-located memory for the given gpu with the given size and features.
GPUAllocation & operator=(GPUAllocation &&) noexcept
static void * get_win32_shared_attributes(void)
Retrieves the default win32 shared attributes for creating a shared object that can be set in CUmemAl...
static GPUAllocation * open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl)
Retrieves the GPUAllocation given the CUipcMemHandle.
GPUAllocation(GPUAllocation &&other) noexcept
CUdeviceptr get_dptr(void) const
Retrieves the base CUdeviceptr for the associated allocation that can be used to access the underlyin...
Definition cuda_internal.h:1193
GPU * get_gpu(void) const
Retrieves the owning GPU.
Definition cuda_internal.h:1196
Definition cuda_internal.h:924
virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
Definition cuda_internal.h:946
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition cuda_internal.h:931
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
GPU * get_gpu() const
Definition cuda_internal.h:942
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
Definition cuda_internal.h:718
GPURequest * req
Definition cuda_internal.h:722
Definition cuda_internal.h:196
virtual ~GPUCompletionNotification(void)
Definition cuda_internal.h:198
virtual void request_completed(void)=0
Definition cuda_internal.h:183
void destroy_context(InternalTask *task, void *context) const override
void destroy_context(Task *task, void *context) const override
GPU * gpu
Definition cuda_internal.h:190
void * create_context(Task *task) const override
GPUContextManager(GPU *_gpu, GPUProcessor *proc)
void * create_context(InternalTask *task) const override
GPUProcessor * proc
Definition cuda_internal.h:191
Definition cuda_internal.h:631
GPU * gpu
Definition cuda_internal.h:667
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)
size_t cur_size
Definition cuda_internal.h:669
NetworkSegment local_segment
Definition cuda_internal.h:671
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)
Mutex mutex
Definition cuda_internal.h:668
std::map< RegionInstance, std::pair< CUdeviceptr, size_t > > alloc_bases
Definition cuda_internal.h:670
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
Definition cuda_internal.h:337
CUevent get_event(bool external=false)
Mutex mutex
Definition cuda_internal.h:351
void init_pool(int init_size=0)
int batch_size
Definition cuda_internal.h:352
void return_event(CUevent e, bool external=false)
std::vector< CUevent > available_events
Definition cuda_internal.h:353
GPUEventPool(int _batch_size=256)
int total_size
Definition cuda_internal.h:352
int current_size
Definition cuda_internal.h:352
int external_count
Definition cuda_internal.h:352
Definition cuda_internal.h:705
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
NetworkSegment local_segment
Definition cuda_internal.h:713
CUdeviceptr base
Definition cuda_internal.h:712
GPU * gpu
Definition cuda_internal.h:711
Definition cuda_internal.h:600
NetworkSegment local_segment
Definition cuda_internal.h:628
GPU * gpu
Definition cuda_internal.h:626
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void * get_direct_ptr(off_t offset, size_t size)
CUdeviceptr base
Definition cuda_internal.h:627
virtual void put_bytes(off_t offset, const void *src, size_t size)
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual ~GPUFBMemory(void)
Definition cuda_internal.h:848
GPU * get_gpu() const
Definition cuda_internal.h:880
static const bool is_ordered
Definition cuda_internal.h:855
GPU * src_gpu
Definition cuda_internal.h:884
virtual bool supports_indirection_memory(Memory mem) const
Queries if a given mem can be used as an indirection buffer.
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
virtual Memory suggest_ib_memories() const
GPUIndirectChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
virtual bool needs_wrapping_iterator() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret=0, unsigned *bw_ret=0, unsigned *lat_ret=0)
Definition cuda_internal.h:887
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUIndirectRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:905
bool serialize(S &serializer) const
GPUIndirectRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths, const std::vector< Memory > &_indirect_memories)
virtual RemoteChannel * create_remote_channel()
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:908
virtual bool needs_wrapping_iterator() const
virtual Memory suggest_ib_memories() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret, unsigned *bw_ret, unsigned *lat_ret)
GPUIndirectRemoteChannel(uintptr_t _remote_ptr, const std::vector< Memory > &_indirect_memories)
Definition cuda_internal.h:734
size_t read_ind_offset
Definition cuda_internal.h:750
size_t write_size
Definition cuda_internal.h:752
size_t write_offset
Definition cuda_internal.h:752
int write_ind_port_idx
Definition cuda_internal.h:753
size_t read_offset
Definition cuda_internal.h:748
size_t write_ind_offset
Definition cuda_internal.h:754
size_t read_ind_size
Definition cuda_internal.h:750
GPUIndirectTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size, int _read_ind_port_idx=-1, size_t _read_ind_offset=0, size_t _read_ind_size=0, int _write_ind_port_idx=-1, size_t _write_ind_offset=0, size_t _write_ind_size=0)
int write_port_idx
Definition cuda_internal.h:751
size_t write_ind_size
Definition cuda_internal.h:754
int read_ind_port_idx
Definition cuda_internal.h:749
XferDes * xd
Definition cuda_internal.h:746
size_t read_size
Definition cuda_internal.h:748
int read_port_idx
Definition cuda_internal.h:747
Definition cuda_internal.h:832
bool progress_xd(GPUIndirectChannel *channel, TimeLimit work_until)
std::vector< bool > dst_is_ipc
Definition cuda_internal.h:844
std::vector< GPU * > dst_gpus
Definition cuda_internal.h:843
long get_requests(Request **requests, long nr)
std::vector< GPU * > src_gpus
Definition cuda_internal.h:843
GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
Definition cuda_internal.h:554
virtual void shutdown(void)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:574
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)
virtual ~GPUProcessor(void)
GPU * gpu
Definition cuda_internal.h:571
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition cuda_internal.h:584
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)
Definition cuda_internal.h:955
virtual RemoteChannel * create_remote_channel()
GPURemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPURemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:971
static RemoteChannelInfo * deserialize_new(S &deserializer)
bool serialize(S &serializer) const
Definition cuda_internal.h:974
virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
Definition cuda_internal.h:980
Definition cuda_internal.h:1134
GPUReplHeapListener(CudaModule *_module)
virtual void chunk_created(void *base, size_t bytes)
virtual void chunk_destroyed(void *base, size_t bytes)
Definition cuda_internal.h:725
GPUCompletionEvent event
Definition cuda_internal.h:731
void * dst_base
Definition cuda_internal.h:728
const void * src_base
Definition cuda_internal.h:727
GPU * dst_gpu
Definition cuda_internal.h:730
Definition cuda_internal.h:247
bool ok_to_submit_copy(size_t bytes, XferDes *xd)
REALM_INTERNAL_API_EXTERNAL_LINKAGE CUstream get_stream(void) const
void add_notification(GPUCompletionNotification *notification)
void add_event(CUevent event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)
Mutex mutex
Definition cuda_internal.h:281
void add_start_event(GPUWorkStart *start)
bool has_work(void) const
GPU * gpu
Definition cuda_internal.h:276
GPU * get_gpu(void) const
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)
GPUWorker * worker
Definition cuda_internal.h:277
bool reap_events(TimeLimit work_until)
void add_fence(GPUWorkFence *fence)
std::deque< PendingEvent > pending_events
Definition cuda_internal.h:291
void wait_on_streams(const std::set< GPUStream * > &other_streams)
CUstream stream
Definition cuda_internal.h:279
Definition cuda_internal.h:757
size_t read_offset
Definition cuda_internal.h:768
int write_port_idx
Definition cuda_internal.h:769
XferDes * xd
Definition cuda_internal.h:766
size_t write_size
Definition cuda_internal.h:770
size_t write_offset
Definition cuda_internal.h:770
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)
virtual void request_completed(void)
size_t read_size
Definition cuda_internal.h:768
int read_port_idx
Definition cuda_internal.h:767
Definition cuda_internal.h:203
virtual void request_cancellation(void)
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition cuda_internal.h:216
GPU * gpu
Definition cuda_internal.h:224
void enqueue_on_stream(GPUStream *stream)
virtual void print(std::ostream &os) const
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition cuda_internal.h:220
GPUWorkFence(GPU *gpu, Realm::Operation *op)
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)
static void cuda_callback(CUstream stream, CUresult res, void *data)
virtual void mark_finished(bool successful)
Definition cuda_internal.h:227
void enqueue_on_stream(GPUStream *stream)
GPUWorkStart(Realm::Operation *op)
virtual void request_cancellation(void)
Definition cuda_internal.h:231
virtual void print(std::ostream &os) const
static void cuda_start_callback(CUstream stream, CUresult res, void *data)
Definition cuda_internal.h:298
virtual ~GPUWorker(void)
bool process_streams(bool sleep_on_empty)
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition cuda_internal.h:325
void shutdown_background_thread(void)
ActiveStreamQueue active_streams
Definition cuda_internal.h:326
void add_stream(GPUStream *s)
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:329
Mutex::CondVar condvar
Definition cuda_internal.h:323
Mutex lock
Definition cuda_internal.h:322
atomic< bool > worker_shutdown_requested
Definition cuda_internal.h:332
bool do_work(TimeLimit work_until)
bool thread_sleeping
Definition cuda_internal.h:331
Realm::Thread * worker_thread
Definition cuda_internal.h:330
Definition cuda_internal.h:796
bool progress_xd(GPUChannel *channel, TimeLimit work_until)
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)
static size_t read_address_entry(AffineCopyInfo< 3 > &copy_infos, size_t &min_align, MemcpyTransposeInfo< size_t > &transpose_info, AddressListCursor &in_alc, uintptr_t in_base, AddressListCursor &out_alc, uintptr_t out_base, size_t bytes_left, size_t max_xfer_fields, size_t &fields_total)
long get_requests(Request **requests, long nr)
Definition cuda_internal.h:674
NetworkSegment local_segment
Definition cuda_internal.h:702
virtual ~GPUZCMemory(void)
char * cpu_base
Definition cuda_internal.h:701
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void put_bytes(off_t offset, const void *src, size_t size)
CUdeviceptr gpu_base
Definition cuda_internal.h:700
virtual void get_bytes(off_t offset, void *dst, size_t size)
Definition cuda_internal.h:392
void pop_context(void)
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)
void launch_transpose_kernel(MemcpyTransposeInfo< size_t > &copy_info, size_t elemSize, GPUStream *stream)
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)
CUdeviceptr fbmem_base
Definition cuda_internal.h:482
GPUFuncInfo fill_affine_large_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:479
std::vector< CudaIpcMapping > cudaipc_mappings
Definition cuda_internal.h:519
void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize, size_t volume, bool multified_optimized, GPUStream *stream)
bool can_access_peer(const GPU *peer) const
GPUFBMemory * fbmem
Definition cuda_internal.h:449
GPUFuncInfo transpose_kernels[CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:480
std::unordered_map< ReductionOpID, GPUReductionOpEntry > gpu_reduction_table
Definition cuda_internal.h:539
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const
GPUStream * device_to_host_stream
Definition cuda_internal.h:497
GPUProcessor * proc
Definition cuda_internal.h:446
GPUFuncInfo batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:473
std::vector< GPUStream * > task_streams
Definition cuda_internal.h:501
ContextSynchronizer ctxsync
Definition cuda_internal.h:442
CUmodule device_module
Definition cuda_internal.h:455
void create_processor(RuntimeImpl *runtime, size_t stack_size)
Mutex alloc_mutex
Definition cuda_internal.h:521
std::set< Memory > managed_mems
Definition cuda_internal.h:490
GPUStream * host_to_device_stream
Definition cuda_internal.h:496
GPUStream * get_next_d2d_stream()
GPUFuncInfo indirect_copy_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:472
GPUFuncInfo multi_batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:475
GPUStream * device_to_device_stream
Definition cuda_internal.h:498
size_t cupti_activity_refcount
Definition cuda_internal.h:504
std::map< NodeID, GPUStream * > cudaipc_streams
Definition cuda_internal.h:520
std::map< CUdeviceptr, GPUAllocation > allocations
Definition cuda_internal.h:448
void push_context(void)
std::set< Memory > pinned_sysmems
Definition cuda_internal.h:487
GPUFuncInfo batch_fill_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:477
static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES
Definition cuda_internal.h:469
GPUStream * get_next_task_stream(bool create=false)
void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size, size_t field_size, size_t volume, GPUStream *stream)
CUmodule load_cuda_module(const void *data)
int least_stream_priority
Definition cuda_internal.h:510
CUdeviceptr fb_ibmem_base
Definition cuda_internal.h:484
bool register_reduction(ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl, CUfunction fold_excl, CUfunction fold_nonexcl, CUfunction apply_excl_advanced, CUfunction apply_nonexcl_advanced, CUfunction fold_excl_advanced, CUfunction fold_nonexcl_advanced, CUfunction apply_excl_transpose, CUfunction apply_nonexcl_transpose, CUfunction fold_excl_transpose, CUfunction fold_nonexcl_transpose)
GPUAllocation & add_allocation(GPUAllocation &&alloc)
GPUDynamicFBMemory * fb_dmem
Definition cuda_internal.h:450
void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
int greatest_stream_priority
Definition cuda_internal.h:510
const CudaIpcMapping * find_ipc_mapping(Memory mem) const
GPUEventPool event_pool
Definition cuda_internal.h:506
CUcontext context
Definition cuda_internal.h:453
GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context)
std::vector< GPUStream * > device_to_device_streams
Definition cuda_internal.h:499
GPUFBIBMemory * fb_ibmem
Definition cuda_internal.h:451
atomic< unsigned > next_task_stream
Definition cuda_internal.h:502
GPUStream * find_stream(CUstream stream) const
bool is_accessible_host_mem(const MemoryImpl *mem) const
GPUInfo * info
Definition cuda_internal.h:444
std::vector< GPUStream * > peer_to_peer_streams
Definition cuda_internal.h:500
GPUWorker * worker
Definition cuda_internal.h:445
void create_dma_channels(Realm::RuntimeImpl *r)
std::set< Memory > peer_fbs
Definition cuda_internal.h:493
atomic< unsigned > next_d2d_stream
Definition cuda_internal.h:503
bool is_accessible_gpu_mem(const MemoryImpl *mem) const
Definition cuda_internal.h:1003
static const bool is_ordered
Definition cuda_internal.h:1008
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
long submit(Request **requests, long nr)
GPU * gpu
Definition cuda_internal.h:1023
Definition cuda_internal.h:988
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)
size_t reduced_fill_size
Definition cuda_internal.h:1000
long get_requests(Request **requests, long nr)
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)
Definition cuda_internal.h:1072
GPU * gpu
Definition cuda_internal.h:1096
RemoteChannelInfo * construct_remote_info() const override
static const bool is_ordered
Definition cuda_internal.h:1077
XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total) override
long submit(Request **requests, long nr) override
bool supports_redop(ReductionOpID redop_id) const override
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
Definition cuda_internal.h:1099
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
bool serialize(S &serializer) const
virtual RemoteChannel * create_remote_channel()
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:1115
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:1118
Definition cuda_internal.h:1033
long get_requests(Request **requests, long nr)
const void * kernel_host_proxy_advanced
Definition cuda_internal.h:1065
std::vector< bool > src_is_ipc
Definition cuda_internal.h:1069
GPUStream * stream
Definition cuda_internal.h:1067
void record_redop_advanced_kernel(GPU *gpu)
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
const void * kernel_host_proxy
Definition cuda_internal.h:1064
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)
const void * kernel_host_proxy_transpose
Definition cuda_internal.h:1066
CUfunction kernel
Definition cuda_internal.h:1061
XferDesRedopInfo redop_info
Definition cuda_internal.h:1059
std::vector< GPU * > src_gpus
Definition cuda_internal.h:1068
bool fast_reduction_kernel_mode(GPUreduceChannel *channel, const size_t max_bytes, XferPort *in_port, XferPort *out_port, const size_t in_span_start, const size_t out_span_start)
const ReductionOpUntyped * redop
Definition cuda_internal.h:1060
KernelVariantDesc describe_kernel_variant(GPU *cpu, bool is_advanced)
CUfunction kernel_transpose
Definition cuda_internal.h:1063
void setup_redop_kernel(GPUreduceChannel *channel, void *params, const size_t in_span_start, const size_t out_span_start, const size_t in_elem_size, const size_t out_elem_size, const size_t elems, const bool has_transpose)
bool resolve_kernel_slot(GPU *gpu, void *host_proxy, CUfunction &kernel_out, CUfunction GPU::GPUReductionOpEntry::*cache_field)
CUfunction kernel_advanced
Definition cuda_internal.h:1062
Definition cuda_internal.h:773
CUarray array
Definition cuda_internal.h:778
Definition threads.h:428
Definition instance.h:405
Definition ib_memory.h:30
Definition indexspace.h:1115
Definition inst_layout.h:267
Definition tasks.h:181
Definition lists.h:66
Definition mem_impl.h:344
Definition proc_impl.h:141
Definition mem_impl.h:212
Definition mem_impl.h:50
MemoryKind
Definition mem_impl.h:53
size_t size
Definition mem_impl.h:195
AllocationResult
Definition mem_impl.h:89
Definition memory.h:33
Kind
Definition memory.h:59
Definition module.h:100
Definition network.h:262
Definition operation.h:75
Operation * op
Definition operation.h:87
Definition operation.h:32
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition inst_impl.h:54
Definition channel.h:896
Definition channel.h:939
Definition repl_heap.h:50
Definition channel.h:103
Definition runtime_impl.h:265
Definition channel.h:909
Definition channel.h:1019
Definition tasks.h:199
Definition tasks.h:41
Definition threads.h:89
Definition timers.h:129
Definition mutex.h:325
Definition mutex.h:223
Definition channel.h:286
Channel * channel
Definition channel.h:343
Definition atomics.h:31
Definition utils.h:84
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define CUDA_DRIVER_APIS(__op__)
Definition cuda_internal.h:1409
#define NVML_APIS(__op__)
Definition cuda_internal.h:1550
#define DECL_FNPTR_EXTERN(name, ver)
Definition cuda_internal.h:1511
#define CUPTI_APIS(__op__)
Definition cuda_internal.h:1568
#define cudaDeviceProp
Definition hip_cuda.h:24
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42
CudaModule * cuda_module_singleton
CUresult cuGetProcAddress(const char *, void **, int, int)
CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event)
GPUMemcpyKind
Definition cuda_internal.h:162
@ GPU_MEMCPY_PEER_TO_PEER
Definition cuda_internal.h:166
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition cuda_internal.h:163
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition cuda_internal.h:164
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition cuda_internal.h:165
nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType)
nvmlIntNvLinkDeviceType_enum
Definition cuda_internal.h:1538
@ NVML_NVLINK_DEVICE_TYPE_IBMNPU
Definition cuda_internal.h:1540
@ NVML_NVLINK_DEVICE_TYPE_SWITCH
Definition cuda_internal.h:1541
@ NVML_NVLINK_DEVICE_TYPE_UNKNOWN
Definition cuda_internal.h:1542
@ NVML_NVLINK_DEVICE_TYPE_GPU
Definition cuda_internal.h:1539
enum Realm::Cuda::nvmlIntNvLinkDeviceType_enum nvmlIntNvLinkDeviceType_t
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
XferDesKind
Definition channel.h:85
int CustomSerdezID
Definition custom_serdez.h:148
int OsHandle
Definition utils.h:399
unsigned long long XferDesID
Definition channel.h:57
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
#define REALM_MAX_DIM
Definition realm_config.h:34
Definition channel.h:684
Definition cuda_memcpy.h:102
Definition cuda_internal.h:1125
static void handle_message(NodeID sender, const CudaIpcImportRequest &args, const void *data, size_t datalen)
unsigned count
Definition cuda_internal.h:1126
long hostid
Definition cuda_internal.h:1128
Definition cudart_hijack.h:53
Definition cuda_internal.h:127
int pci_busid
Definition cuda_internal.h:141
CUdevice device
Definition cuda_internal.h:129
size_t pci_bandwidth
Definition cuda_internal.h:145
std::vector< size_t > logical_peer_bandwidth
Definition cuda_internal.h:148
int pci_domainid
Definition cuda_internal.h:142
CUuuid uuid
Definition cuda_internal.h:131
std::set< CUdevice > peers
Definition cuda_internal.h:140
bool has_numa_preference
Definition cuda_internal.h:138
bool pageable_access_supported
Definition cuda_internal.h:154
std::vector< size_t > logical_peer_latency
Definition cuda_internal.h:149
bool host_gpu_same_va
Definition cuda_internal.h:147
unsigned fabric_clique
Definition cuda_internal.h:152
bool fabric_supported
Definition cuda_internal.h:151
char name[MAX_NAME_LEN]
Definition cuda_internal.h:135
int major
Definition cuda_internal.h:132
size_t totalGlobalMem
Definition cuda_internal.h:136
int pci_deviceid
Definition cuda_internal.h:143
nvmlDevice_t nvml_dev
Definition cuda_internal.h:130
unsigned long numa_node_affinity[MAX_NUMA_NODE_LEN]
Definition cuda_internal.h:139
size_t c2c_bandwidth
Definition cuda_internal.h:144
int index
Definition cuda_internal.h:128
int minor
Definition cuda_internal.h:133
size_t nvswitch_bandwidth
Definition cuda_internal.h:146
CUuuid fabric_uuid
Definition cuda_internal.h:153
static const size_t MAX_NAME_LEN
Definition cuda_internal.h:134
static const size_t MAX_NUMA_NODE_LEN
Definition cuda_internal.h:137
Definition cuda_internal.h:576
Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition cuda_internal.h:578
Processor::TaskFuncPtr fnptr
Definition cuda_internal.h:577
ByteArray user_data
Definition cuda_internal.h:579
Definition cuda_internal.h:282
GPUWorkStart * start
Definition cuda_internal.h:285
CUevent event
Definition cuda_internal.h:283
GPUWorkFence * fence
Definition cuda_internal.h:284
GPUCompletionNotification * notification
Definition cuda_internal.h:286
Definition cuda_internal.h:512
uintptr_t address_offset
Definition cuda_internal.h:517
NodeID owner
Definition cuda_internal.h:513
GPU * src_gpu
Definition cuda_internal.h:514
Memory mem
Definition cuda_internal.h:515
uintptr_t local_base
Definition cuda_internal.h:516
Definition cuda_internal.h:457
CUfunction func
Definition cuda_internal.h:458
int occ_num_threads
Definition cuda_internal.h:459
int occ_num_blocks
Definition cuda_internal.h:460
Definition cuda_internal.h:524
CUfunction fold_excl
Definition cuda_internal.h:528
CUfunction fold_nonexcl_transpose
Definition cuda_internal.h:535
CUfunction fold_excl_transpose
Definition cuda_internal.h:536
CUfunction apply_excl_advanced
Definition cuda_internal.h:530
CUfunction apply_excl_transpose
Definition cuda_internal.h:534
CUfunction apply_nonexcl
Definition cuda_internal.h:525
CUfunction apply_nonexcl_advanced
Definition cuda_internal.h:529
CUfunction fold_nonexcl
Definition cuda_internal.h:527
CUfunction apply_excl
Definition cuda_internal.h:526
CUfunction fold_nonexcl_advanced
Definition cuda_internal.h:531
CUfunction fold_excl_advanced
Definition cuda_internal.h:532
CUfunction apply_nonexcl_transpose
Definition cuda_internal.h:533
Definition cuda_internal.h:1028
void * host_proxy
Definition cuda_internal.h:1029
CUfunction GPU::GPUReductionOpEntry::* cache_field
Definition cuda_internal.h:1030
Definition cuda_memcpy.h:114
Definition cudart_hijack.h:65
Definition cudart_hijack.h:76
Definition redop.h:56
Definition channel.h:210
Definition channel.h:300
NodeID src
Definition ucp_internal.h:1