Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
cuda_internal.h
Go to the documentation of this file.
1/*
2 * Copyright 2026 Stanford University, NVIDIA Corporation, Los Alamos National Laboratory
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CUDA_INTERNAL_H
19#define REALM_CUDA_INTERNAL_H
20
22
23#include <memory>
24#include <unordered_map>
25#if !defined(CUDA_ENABLE_DEPRECATED)
26// Ignore deprecation warnings from cuda headers
27#define CUDA_ENABLE_DEPRECATED 1
28#endif
29#include <cuda.h>
30#include <nvml.h>
31#include <cupti.h>
32#if defined(REALM_USE_CUDART_HIJACK)
33#include <cuda_runtime_api.h> // For cudaDeviceProp
34#endif
35
36// For CUDA runtime's dim3 definition
37#include <vector_types.h>
38
39#include "realm/operation.h"
40#include "realm/threads.h"
41#include "realm/circ_queue.h"
42#include "realm/indexspace.h"
43#include "realm/proc_impl.h"
44#include "realm/mem_impl.h"
45#include "realm/bgwork.h"
49
// CHECK_CUDART(cmd): abort-on-failure wrapper for CUDA *runtime* API calls.
// On toolkits older than 11.0 the runtime error code is printed as a raw
// integer and the process asserts/exits; on 11.0+ the code is forwarded to
// CHECK_CU, since runtime and driver error codes are 1:1 there.
50#if CUDART_VERSION < 11000
51#define CHECK_CUDART(cmd) \
52 do { \
53 int ret = (int)(cmd); \
54 if(ret != 0) { \
55 fprintf(stderr, "CUDART: %s = %d\n", #cmd, ret); \
56 assert(0); \
57 exit(1); \
58 } \
59 } while(0)
60#else
61// Since CUDA TK11.0, runtime and driver error codes are 1:1 correlated
62#define CHECK_CUDART(cmd) CHECK_CU((CUresult)(cmd))
63#endif
64
// REPORT_CU_ERROR(level, cmd, ret): log a failed CUDA driver-API call at the
// given logger level, tagged with file/line, the failing expression text, and
// the numeric result code.
// Need CUDA 6.5 or later for good error reporting - cuGetErrorName and
// cuGetErrorString are only available from 6.5 on, so older toolkits fall
// back to logging just the numeric result code.
#if CUDA_VERSION >= 6050
#define REPORT_CU_ERROR(level, cmd, ret)                                                 \
  do {                                                                                   \
    const char *name, *str;                                                              \
    CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorName)(ret, &name);                          \
    CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorString)(ret, &str);                         \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret  \
                          << '(' << name << "): " << str;                                 \
  } while(0)
#else
// Fallback without the human-readable error name/description.
// BUGFIX: the original fallback was missing the trailing ';' on the log
// statement, so this branch could not compile when selected.
#define REPORT_CU_ERROR(level, cmd, ret)                                                 \
  do {                                                                                   \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
#endif
81
// CHECK_CU(cmd): evaluate a CUDA driver-API call; on any result other than
// CUDA_SUCCESS, log the failing expression via REPORT_CU_ERROR and abort().
82#define CHECK_CU(cmd) \
83 do { \
84 CUresult ret = (cmd); \
85 if(ret != CUDA_SUCCESS) { \
86 REPORT_CU_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
87 abort(); \
88 } \
89 } while(0)
90
// REPORT_NVML_ERROR(level, cmd, ret): log a failed NVML call at the given
// logger level, tagged with file/line, the failing expression, and the code.
91#define REPORT_NVML_ERROR(level, cmd, ret) \
92 do { \
93 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
94 } while(0)
95
// CHECK_NVML(cmd): evaluate an NVML call; on any result other than
// NVML_SUCCESS, log it via REPORT_NVML_ERROR and abort().
96#define CHECK_NVML(cmd) \
97 do { \
98 nvmlReturn_t ret = (cmd); \
99 if(ret != NVML_SUCCESS) { \
100 REPORT_NVML_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
101 abort(); \
102 } \
103 } while(0)
104
// IS_DEFAULT_STREAM(stream): true when 'stream' is any spelling of the CUDA
// default stream - the null handle, CU_STREAM_LEGACY, or CU_STREAM_PER_THREAD.
105#define IS_DEFAULT_STREAM(stream) \
106 (((stream) == 0) || ((stream) == CU_STREAM_LEGACY) || \
107 ((stream) == CU_STREAM_PER_THREAD))
108
// REPORT_CUPTI_ERROR(level, cmd, ret): log a failed CUPTI call at the given
// logger level, tagged with file/line, the failing expression, and the code.
109#define REPORT_CUPTI_ERROR(level, cmd, ret) \
110 do { \
111 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
112 } while(0)
113
// CHECK_CUPTI(cmd): evaluate a CUPTI call; on any result other than
// CUPTI_SUCCESS, log it via REPORT_CUPTI_ERROR and abort().
114#define CHECK_CUPTI(cmd) \
115 do { \
116 CUptiResult ret = (cmd); \
117 if(ret != CUPTI_SUCCESS) { \
118 REPORT_CUPTI_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
119 abort(); \
120 } \
121 } while(0)
122
123namespace Realm {
124
125 namespace Cuda {
126
// Per-device description record for one CUDA-capable GPU.
// NOTE(review): several members are elided from this view of the file
// (internal lines 135-136, 138-139, 141-143 are missing from the dump),
// so this is not the complete member list.
127 struct GPUInfo {
128 int index; // index used by CUDA runtime
129 CUdevice device; // CUDA driver device handle
130 nvmlDevice_t nvml_dev; // matching NVML handle for the same device
131 CUuuid uuid; // device UUID
132 int major; // presumably SM compute capability major - confirm
133 int minor; // presumably SM compute capability minor - confirm
134 static const size_t MAX_NAME_LEN = 256;
137 static const size_t MAX_NUMA_NODE_LEN = 20;
140 std::set<CUdevice> peers; // other GPUs we can do p2p copies with
144 size_t c2c_bandwidth = 0; // Current enabled c2c bandwidth
145 size_t pci_bandwidth = 0; // Current enabled pci-e bandwidth
146 size_t nvswitch_bandwidth = 0; // Current enabled nvswitch bandwidth
147 bool host_gpu_same_va = false;
148 std::vector<size_t> logical_peer_bandwidth;
149 std::vector<size_t> logical_peer_latency;
150 // Fabric information for this gpu
151 bool fabric_supported = false;
152 unsigned fabric_clique = -1U;
153 CUuuid fabric_uuid = {0};
155
156#ifdef REALM_USE_CUDART_HIJACK
157 cudaDeviceProp prop; // runtime-API device properties (hijack builds only)
158#endif
159 };
160
168
 169 // Forward declarations
170 class GPUProcessor;
171 class GPUWorker;
172 class GPUStream;
173 class GPUFBMemory;
174 class GPUDynamicFBMemory;
175 class GPUZCMemory;
176 class GPUFBIBMemory;
177 class GPUAllocation;
178 class GPU;
179 class CudaModule;
180
181 extern CudaModule *cuda_module_singleton;
182
184 public:
186 void *create_context(Task *task) const override;
187 void destroy_context(Task *task, void *context) const override;
188 void *create_context(InternalTask *task) const override;
189 void destroy_context(InternalTask *task, void *context) const override;
190 GPU *gpu = nullptr;
191 GPUProcessor *proc = nullptr; // TODO(cperry): delete me
192 };
193
194 // an interface for receiving completion notification for a GPU operation
195 // (right now, just copies)
197 public:
199
200 virtual void request_completed(void) = 0;
201 };
202
204 public:
207
208 virtual void mark_finished(bool successful);
209
210 virtual void request_cancellation(void);
211
213
214 virtual void print(std::ostream &os) const;
215
219 DummyLock>
221
222 protected:
223 static void cuda_callback(CUstream stream, CUresult res, void *data);
224 GPU *gpu = nullptr;
225 };
226
228 public:
230
231 virtual void request_cancellation(void) { return; };
232
234
235 virtual void print(std::ostream &os) const;
236
238
239 protected:
240 static void cuda_start_callback(CUstream stream, CUresult res, void *data);
241 };
242
243 // a class that represents a CUDA stream and work associated with
244 // it (e.g. queued copies, events in flight)
245 // a stream is also associated with a GPUWorker that it will register
246 // with when async work needs doing
247 class GPUStream {
248 public:
249 GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0);
251
252 GPU *get_gpu(void) const;
254 get_stream(void) const; // needed by librealm_kokkos.so
255
256 // may be called by anybody to enqueue a copy or an event
260 void add_event(CUevent event, GPUWorkFence *fence,
261 GPUCompletionNotification *notification = NULL,
262 GPUWorkStart *start = NULL);
263 void wait_on_streams(const std::set<GPUStream *> &other_streams);
264
265 // atomically checks rate limit counters and returns true if 'bytes'
266 // worth of copies can be submitted or false if not (in which case
267 // the progress counter on the xd will be updated when it should try
268 // again)
269 bool ok_to_submit_copy(size_t bytes, XferDes *xd);
270 bool reap_events(TimeLimit work_until);
271
272 protected:
273 // may only be tested with lock held
274 bool has_work(void) const;
275
278
279 CUstream stream;
280
288#ifdef USE_CQ
290#else
291 std::deque<PendingEvent> pending_events;
292#endif
293 };
294
295 // a GPUWorker is responsible for making progress on one or more GPUStreams -
296 // this may be done directly by a GPUProcessor or in a background thread
297 // spawned for the purpose
299 public:
301 virtual ~GPUWorker(void);
302
303 // adds a stream that has work to be done
305
306 // used to start a dedicate thread (mutually exclusive with being
307 // registered with a background work manager)
310
311 bool do_work(TimeLimit work_until);
312
313 public:
314 void thread_main(void);
315
316 protected:
317 // used by the background thread
318 // processes work on streams, optionally sleeping for work to show up
319 // returns true if work remains to be done
320 bool process_streams(bool sleep_on_empty);
321
324
327
328 // used by the background thread (if any)
333 };
334
335 // a little helper class to manage a pool of CUevents that can be reused
336 // to reduce alloc/destroy overheads
338 public:
339 GPUEventPool(int _batch_size = 256);
340
341 // allocating the initial batch of events and cleaning up are done with
342 // these methods instead of constructor/destructor because we don't
343 // manage the GPU context in this helper class
344 void init_pool(int init_size = 0 /* default == batch size */);
345 void empty_pool(void);
346
347 CUevent get_event(bool external = false);
348 void return_event(CUevent e, bool external = false);
349
350 protected:
353 std::vector<CUevent> available_events;
354 };
355
356 // when the runtime hijack is not enabled/active, a cuCtxSynchronize
357 // is required to ensure a task's completion event covers all of its
358 // actions - rather than blocking an important thread, we create a
359 // small thread pool to handle these
384
385 struct FatBin;
386 struct RegisteredVariable;
387 struct RegisteredFunction;
388
389 // a GPU object represents our use of a given CUDA-capable GPU - this will
390 // have an associated CUDA context, a (possibly shared) worker thread, a
391 // processor, and an FB memory (the ZC memory is shared across all GPUs)
392 class GPU {
393 public:
394 GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context);
395 ~GPU(void);
396
397 void push_context(void);
398 void pop_context(void);
399
401
402 void create_processor(RuntimeImpl *runtime, size_t stack_size);
403 void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size);
404 void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size);
405
407
408 bool can_access_peer(const GPU *peer) const;
409
410 GPUStream *find_stream(CUstream stream) const;
412 get_null_task_stream(void) const; // needed by librealm_kokkos.so
413 GPUStream *get_next_task_stream(bool create = false);
415
416 void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize,
417 size_t volume, GPUStream *stream);
418 void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize,
419 size_t volume, GPUStream *stream);
421 size_t elemSize, GPUStream *stream);
422
423 void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size,
424 size_t field_size, size_t volume,
425 GPUStream *stream);
426 bool is_accessible_host_mem(const MemoryImpl *mem) const;
427 bool is_accessible_gpu_mem(const MemoryImpl *mem) const;
428
429 bool register_reduction(ReductionOpID redop_id, CUfunction apply_excl,
430 CUfunction apply_nonexcl, CUfunction fold_excl,
431 CUfunction fold_nonexcl);
432
433 protected:
434 CUmodule load_cuda_module(const void *data);
435
436 public:
438 CudaModule *module = nullptr;
439 GPUInfo *info = nullptr;
440 GPUWorker *worker = nullptr;
441 GPUProcessor *proc = nullptr;
442
443 std::map<CUdeviceptr, GPUAllocation> allocations;
444 GPUFBMemory *fbmem = nullptr;
447
448 CUcontext context = nullptr;
449
450 CUmodule device_module = nullptr;
451
452 struct GPUFuncInfo {
453 CUfunction func;
456 };
457
458 // The maximum value of log2(type_bytes) that cuda kernels handle.
459 // log2(1 byte) --> 0
460 // log2(2 bytes) --> 1
461 // log2(4 bytes) --> 2
462 // log2(8 bytes) --> 3
463 // log2(16 bytes) --> 4
464 static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES = 5;
465
474
475 CUdeviceptr fbmem_base = 0;
476
477 CUdeviceptr fb_ibmem_base = 0;
478
479 // which system memories have been registered and can be used for cuMemcpyAsync
480 std::set<Memory> pinned_sysmems;
481
482 // managed memories we can concurrently access
483 std::set<Memory> managed_mems;
484
485 // which other FBs we have peer access to
486 std::set<Memory> peer_fbs;
487
488 // streams for different copy types and a pile for actual tasks
492 std::vector<GPUStream *> device_to_device_streams;
493 std::vector<GPUStream *> peer_to_peer_streams; // indexed by target
494 std::vector<GPUStream *> task_streams;
498
500
501 // this can technically be different in each context (but probably isn't
502 // in practice)
504
509 uintptr_t local_base;
510 uintptr_t address_offset; // add to convert from original to local base
511 };
512 std::vector<CudaIpcMapping> cudaipc_mappings;
513 std::map<NodeID, GPUStream *> cudaipc_streams;
516
518 CUfunction apply_nonexcl = nullptr;
519 CUfunction apply_excl = nullptr;
520 CUfunction fold_nonexcl = nullptr;
521 CUfunction fold_excl = nullptr;
522 };
523
524 std::unordered_map<ReductionOpID, GPUReductionOpEntry> gpu_reduction_table;
525 };
526
527 // helper to push/pop a GPU's context by scope
529 public:
533
534 protected:
536 };
537
538 class REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUProcessor // needed by librealm_kokkos.so
540 public:
541 GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me,
542 Realm::CoreReservationSet &crs, size_t _stack_size);
543 virtual ~GPUProcessor(void);
544
545 public:
546 virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc,
547 const ByteArrayRef &user_data);
548
549 virtual void shutdown(void);
550
551 protected:
553 const ByteArrayRef &task_args);
554
555 public:
557
558 protected:
560
562 Processor::TaskFuncPtr fnptr;
563 Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr;
565 };
566
567 // we're not using the parent's task table, but we can use the mutex
568 // RWLock task_table_mutex;
569 std::map<Processor::TaskFuncID, GPUTaskTableEntry> gpu_task_table;
570 };
571
572 // this can be attached to any MemoryImpl if the underlying memory is
573 // guaranteed to belong to a given CUcontext - this will allow that
574 // context's processor and dma channels to work with it
575 // the creator is expected to know what CUcontext they want but need
576 // not know which GPU object that corresponds to
578 public:
579 CudaDeviceMemoryInfo(CUcontext _context);
580
581 CUcontext context;
583 };
584
586 public:
587 GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base,
588 size_t _size);
589
590 virtual ~GPUFBMemory(void);
591
592 // these work, but they are SLOW
593 virtual void get_bytes(off_t offset, void *dst, size_t size);
594 virtual void put_bytes(off_t offset, const void *src, size_t size);
595
596 virtual void *get_direct_ptr(off_t offset, size_t size);
597
598 // GPUFBMemory supports ExternalCudaMemoryResource and
599 // ExternalCudaArrayResource
601 size_t &inst_offset);
603
604 // for re-registration purposes, generate an ExternalInstanceResource *
605 // (if possible) for a given instance, or a subset of one
608 span<const FieldID> fields, bool read_only);
609
610 public:
612 CUdeviceptr base;
614 };
615
617 public:
618 GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu,
619 size_t _max_size);
620
621 virtual ~GPUDynamicFBMemory(void);
622 void cleanup(void);
623
624 // deferred allocation not supported
626 bool need_alloc_result,
627 bool poisoned,
628 TimeLimit work_until);
629
630 virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned,
631 TimeLimit work_until);
632
633 // these work, but they are SLOW
634 virtual void get_bytes(off_t offset, void *dst, size_t size);
635 virtual void put_bytes(off_t offset, const void *src, size_t size);
636
637 virtual void *get_direct_ptr(off_t offset, size_t size);
638
639 // GPUDynamicFBMemory supports ExternalCudaMemoryResource and
640 // ExternalCudaArrayResource
642 size_t &inst_offset);
644
645 // for re-registration purposes, generate an ExternalInstanceResource *
646 // (if possible) for a given instance, or a subset of one
649 span<const FieldID> fields, bool read_only);
650
651 public:
654 size_t cur_size;
655 std::map<RegionInstance, std::pair<CUdeviceptr, size_t>> alloc_bases;
657 };
658
660 public:
661 GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base,
662 void *_cpu_base, size_t _size, MemoryKind _kind,
663 Memory::Kind _lowlevel_kind);
664
665 virtual ~GPUZCMemory(void);
666
667 virtual void get_bytes(off_t offset, void *dst, size_t size);
668
669 virtual void put_bytes(off_t offset, const void *src, size_t size);
670
671 virtual void *get_direct_ptr(off_t offset, size_t size);
672
673 // GPUZCMemory supports ExternalCudaPinnedHostResource
675 size_t &inst_offset);
677
678 // for re-registration purposes, generate an ExternalInstanceResource *
679 // (if possible) for a given instance, or a subset of one
682 span<const FieldID> fields, bool read_only);
683
684 public:
685 CUdeviceptr gpu_base;
686 char *cpu_base;
688 };
689
690 class GPUFBIBMemory : public IBMemory {
691 public:
692 GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base,
693 size_t _size);
694
695 public:
697 CUdeviceptr base;
699 };
700
701 class GPURequest;
702
704 public:
706
708 };
709
710 class GPURequest : public Request {
711 public:
712 const void *src_base;
713 void *dst_base;
714 // off_t src_gpu_off, dst_gpu_off;
717 };
718
720 public:
722 XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size,
723 int _write_port_idx, size_t _write_offset, size_t _write_size,
724 int _read_ind_port_idx = -1, size_t _read_ind_offset = 0,
725 size_t _read_ind_size = 0, int _write_ind_port_idx = -1,
726 size_t _write_ind_offset = 0, size_t _write_ind_size = 0);
727
728 virtual void request_completed(void);
729
730 protected:
740 };
741
743 public:
744 GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset,
745 size_t _read_size, int _write_port_idx, size_t _write_offset,
746 size_t _write_size);
747
748 virtual void request_completed(void);
749
750 protected:
756 };
757
759 public:
760 MemSpecificCudaArray(CUarray _array);
762
763 CUarray array;
764 };
765
767 public:
768 virtual int set_rect(const RegionInstanceImpl *inst,
769 const InstanceLayoutPieceBase *piece, size_t field_size,
770 size_t field_offset, int ndims, const int64_t lo[/*ndims*/],
771 const int64_t hi[/*ndims*/], const int order[/*ndims*/]);
772
773 CUarray array;
774 int dim;
775 size_t pos[3];
777 };
778
779 class GPUChannel;
780
781 class GPUXferDes : public XferDes {
782 public:
783 GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
784 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
785 const std::vector<XferDesPortInfo> &outputs_info, int _priority);
786
787 long get_requests(Request **requests, long nr);
788
790
791 private:
792 std::vector<GPU *> src_gpus, dst_gpus;
793 std::vector<bool> dst_is_ipc;
794 };
795
796 class GPUIndirectChannel;
797
799 public:
800 GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
801 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
802 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
803 XferDesRedopInfo _redop_info);
804
805 long get_requests(Request **requests, long nr);
807
808 protected:
809 std::vector<GPU *> src_gpus, dst_gpus;
810 std::vector<bool> dst_is_ipc;
811 };
812
814 : public SingleXDQChannel<GPUIndirectChannel, GPUIndirectXferDes> {
815 public:
818
819 // multi-threading of cuda copies for a given device is disabled by
820 // default (can be re-enabled with -cuda:mtdma 1)
821 static const bool is_ordered = true;
822
823 virtual bool needs_wrapping_iterator() const;
825
827
828 virtual uint64_t
829 supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id,
830 CustomSerdezID dst_serdez_id, ReductionOpID redop_id,
831 size_t total_bytes, const std::vector<size_t> *src_frags,
832 const std::vector<size_t> *dst_frags, XferDesKind *kind_ret = 0,
833 unsigned *bw_ret = 0, unsigned *lat_ret = 0);
834
835 virtual bool supports_indirection_memory(Memory mem) const;
836
837 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
838 XferDesID guid,
839 const std::vector<XferDesPortInfo> &inputs_info,
840 const std::vector<XferDesPortInfo> &outputs_info,
841 int priority, XferDesRedopInfo redop_info,
842 const void *fill_data, size_t fill_size,
843 size_t fill_total);
844
845 long submit(Request **requests, long nr);
846 GPU *get_gpu() const { return src_gpu; }
847
848 protected:
849 friend class GPUIndirectXferDes;
851 };
852
854 public:
856 uintptr_t _remote_ptr,
857 const std::vector<Channel::SupportedPath> &_paths,
858 const std::vector<Memory> &_indirect_memories);
859
861
862 template <typename S>
863 bool serialize(S &serializer) const;
864
865 template <typename S>
866 static RemoteChannelInfo *deserialize_new(S &deserializer);
867
868 protected:
872 };
873
876
877 public:
878 GPUIndirectRemoteChannel(uintptr_t _remote_ptr,
879 const std::vector<Memory> &_indirect_memories);
881 virtual bool needs_wrapping_iterator() const;
882 virtual uint64_t
883 supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id,
884 CustomSerdezID dst_serdez_id, ReductionOpID redop_id,
885 size_t total_bytes, const std::vector<size_t> *src_frags,
886 const std::vector<size_t> *dst_frags, XferDesKind *kind_ret /*= 0*/,
887 unsigned *bw_ret /*= 0*/, unsigned *lat_ret /*= 0*/);
888 };
889
890 class GPUChannel : public SingleXDQChannel<GPUChannel, GPUXferDes> {
891 public:
894
895 // multi-threading of cuda copies for a given device is disabled by
896 // default (can be re-enabled with -cuda:mtdma 1)
897 static const bool is_ordered = true;
898
899 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
900 XferDesID guid,
901 const std::vector<XferDesPortInfo> &inputs_info,
902 const std::vector<XferDesPortInfo> &outputs_info,
903 int priority, XferDesRedopInfo redop_info,
904 const void *fill_data, size_t fill_size,
905 size_t fill_total);
906
907 long submit(Request **requests, long nr);
908 GPU *get_gpu() const { return src_gpu; }
909
910 private:
911 GPU *src_gpu;
912 // std::deque<Request*> pending_copies;
913 };
914
915 class GPUfillChannel;
916
917 class GPUfillXferDes : public XferDes {
918 public:
919 GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
920 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
921 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
922 const void *_fill_data, size_t _fill_size, size_t _fill_total);
923
924 long get_requests(Request **requests, long nr);
925
927
928 protected:
930 };
931
932 class GPUfillChannel : public SingleXDQChannel<GPUfillChannel, GPUfillXferDes> {
933 public:
935
936 // multiple concurrent cuda fills ok
937 static const bool is_ordered = false;
938
939 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
940 XferDesID guid,
941 const std::vector<XferDesPortInfo> &inputs_info,
942 const std::vector<XferDesPortInfo> &outputs_info,
943 int priority, XferDesRedopInfo redop_info,
944 const void *fill_data, size_t fill_size,
945 size_t fill_total);
946
947 long submit(Request **requests, long nr);
948
949 protected:
950 friend class GPUfillXferDes;
951
953 };
954
955 class GPUreduceChannel;
956
957 class GPUreduceXferDes : public XferDes {
958 public:
959 GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
960 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
961 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
962 XferDesRedopInfo _redop_info);
963
964 long get_requests(Request **requests, long nr);
965
967
968 protected:
971 CUfunction kernel;
972 const void *kernel_host_proxy;
974 std::vector<GPU *> src_gpus;
975 std::vector<bool> src_is_ipc;
976 };
977
978 class GPUreduceChannel : public SingleXDQChannel<GPUreduceChannel, GPUreduceXferDes> {
979 public:
981
982 // multiple concurrent cuda reduces ok
983 static const bool is_ordered = false;
984
985 // helper method here so that GPUreduceRemoteChannel can use it too
986 bool supports_redop(ReductionOpID redop_id) const override;
987
989
990 XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid,
991 const std::vector<XferDesPortInfo> &inputs_info,
992 const std::vector<XferDesPortInfo> &outputs_info,
993 int priority, XferDesRedopInfo redop_info,
994 const void *fill_data, size_t fill_size,
995 size_t fill_total) override;
996
997 long submit(Request **requests, long nr) override;
998
999 protected:
1000 friend class GPUreduceXferDes;
1001
1003 };
1004
1006 public:
1007 GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,
1008 const std::vector<Channel::SupportedPath> &_paths);
1009
1011
1012 template <typename S>
1013 bool serialize(S &serializer) const;
1014
1015 template <typename S>
1016 static RemoteChannelInfo *deserialize_new(S &deserializer);
1017
1018 protected:
1022 };
1023
1026
1027 GPUreduceRemoteChannel(uintptr_t _remote_ptr);
1028 };
1029
1030 // active message for establishing cuda ipc mappings
1032 unsigned count = 0;
1033#if !defined(REALM_IS_WINDOWS)
1034 long hostid = 0;
1035#endif
1036 static void handle_message(NodeID sender, const CudaIpcImportRequest &args,
1037 const void *data, size_t datalen);
1038 };
1039
1041 public:
1043
1044 virtual void chunk_created(void *base, size_t bytes);
1045 virtual void chunk_destroyed(void *base, size_t bytes);
1046
1047 protected:
1048 CudaModule *module;
1049 };
1050
1055 public:
1056 // -- Constructors --
1057 GPUAllocation(void) = default;
1058 GPUAllocation(GPUAllocation &&other) noexcept;
1059 GPUAllocation(const GPUAllocation &) = delete;
1061 GPUAllocation &operator=(const GPUAllocation &) = delete;
1063
1064 // --- Accessors ---
1065 inline operator bool(void) const { return dev_ptr != 0; }
1072
1077 inline bool get_ipc_handle(CUipcMemHandle &handle) const
1078 {
1079 if(has_ipc_handle) {
1080 handle = ipc_handle;
1081 }
1082 return has_ipc_handle;
1083 }
1084#if CUDA_VERSION >= 12030
1089 bool get_fabric_handle(CUmemFabricHandle &handle) const;
1090#endif
1099 inline CUdeviceptr get_dptr(void) const { return dev_ptr; }
1102 inline GPU *get_gpu(void) const { return gpu; }
1105 inline size_t get_size(void) const { return size; }
1106
1112 template <typename T = void>
1113 T *get_hptr(void) const
1114 {
1115 return static_cast<T *>(host_ptr);
1116 }
1117
1124
1125 // -- Allocators --
1126
1136 static GPUAllocation *allocate_dev(GPU *gpu, size_t size, bool peer_enabled = true,
1137 bool shareable = true);
1138#if CUDA_VERSION >= 11000
1152 static GPUAllocation *allocate_mmap(GPU *gpu, const CUmemAllocationProp &prop,
1153 size_t size, CUdeviceptr vaddr = 0,
1154 bool peer_enabled = true);
1155#endif
1167 static GPUAllocation *allocate_host(GPU *gpu, size_t size, bool peer_enabled = true,
1168 bool shareable = true, bool same_va = true);
1175 static GPUAllocation *allocate_managed(GPU *gpu, size_t size);
1187 static GPUAllocation *register_allocation(GPU *gpu, void *ptr, size_t size,
1188 bool peer_enabled = true);
1194 static GPUAllocation *open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl);
1203 static GPUAllocation *open_handle(GPU *gpu, OsHandle hdl, size_t size,
1204 bool peer_enabled = true);
1205#if CUDA_VERSION >= 12030
1216 static GPUAllocation *open_fabric(GPU *gpu, const CUmemFabricHandle &hdl,
1217 size_t size, bool peer_enabled = true,
1218 bool is_local = false);
1219#endif
1220
1221 private:
1222 CUresult map_allocation(GPU *gpu, CUmemGenericAllocationHandle handle, size_t size,
1223 CUdeviceptr va = 0, size_t offset = 0,
1224 bool peer_enabled = false, bool map_host = false);
1225
1226#if CUDA_VERSION >= 11000
1232 static size_t align_size(const CUmemAllocationProp &prop, size_t size);
1233#endif
1234 // -- Deleters --
1235 typedef void (*DeleterCallback)(GPUAllocation &alloc);
1236
1237 // These are helper functions to manage what freeing strategy needs to be used to
1238 // properly free the allocation
1239 static void cuda_malloc_free(GPUAllocation &alloc);
1240 static void cuda_malloc_host_free(GPUAllocation &alloc);
1241 static void cuda_register_free(GPUAllocation &alloc);
1242 static void cuda_ipc_free(GPUAllocation &alloc);
1243#if CUDA_VERSION >= 11000
1244 static void cuda_memmap_free(GPUAllocation &alloc);
1245#endif
1246
1247 // -- Members --
1249 GPU *gpu = nullptr;
1251 CUdeviceptr dev_ptr = 0;
1253 void *host_ptr = nullptr;
1255 size_t size = 0;
1257 DeleterCallback deleter = nullptr;
1258#if CUDA_VERSION >= 11000
1260 CUmemGenericAllocationHandle mmap_handle = 0;
1261 // True if VA needs to be released for cuMemMap'ed memory
1262 // or if the registered memory actually needs to be unregistered
1263 bool owns_va = true;
1264#endif
1266 bool has_ipc_handle = false;
1268 CUipcMemHandle ipc_handle;
1269 };
1270
1271 // Define these APIs locally here if we know the definition isn't in cuda.h. This
1272 // allows us to use this driver function even if it is unavailable to our current
1273 // toolkit
1274
1275#if CUDA_VERSION < 11030
1276#define CU_GET_PROC_ADDRESS_DEFAULT 0
1277 CUresult cuGetProcAddress(const char *, void **, int, int);
1278#endif
1279
1280#if CUDA_VERSION < 12050
1281 CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event);
1282#endif
1283
1284#if CUDA_VERSION >= 13000
1285// Unfortunately, 13.0 violates it's own source compatibility rules versus
1286// cuGetProcAddress, so fix that ourselves here.
1287#if !defined(cuCtxGetDevice)
1288#define cuCtxGetDevice cuCtxGetDevice_v2
1289#endif
1290#if !defined(cuCtxSynchronize)
1291#define cuCtxSynchronize cuCtxSynchronize_v2
1292#endif
1293#if !defined(cuStreamGetCtx)
1294#define cuStreamGetCtx cuStreamGetCtx_v2
1295#endif
1296#endif
1297
1298 // cuda driver and/or runtime entry points
1299#define CUDA_DRIVER_HAS_FNPTR(name) ((name##_fnptr) != nullptr)
1300#define CUDA_DRIVER_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
1301
1302// Only APIs that are available in the minimum base driver version that Realm supports
1303// should be listed here
1304
1305// Note: it is imperative for APIs introduced in minor versions after
1306// the minimum version defined above to explicitly denote the version they were
1307// introduced, otherwise it is possible to retrieve the wrong API and crash when called.
1308
1309// The minimum base driver version Realm supports
1310#define CUDA_VERSION_MIN 11080
1311// Source compatible version of cuda.h (the minimum version where the decltype(&fn)
1312// matches the function returned from cuGetProcAddress(fn, CUDA_VERSION_COMPAT) )
1313#define CUDA_VERSION_COMPAT ((CUDA_VERSION / 1000) * 1000)
1314
// X-macro table of every CUDA driver-API entry point Realm uses.  Expand it
// with an __op__(name, version) macro to generate per-function code (e.g.
// extern function-pointer declarations via DECL_FNPTR_EXTERN below).  The
// second argument is the driver version the entry point is associated with:
// most entries use CUDA_VERSION_MIN; cuCtxRecordEvent uses 12050 (CUDA 12.5,
// where that API first appears).  NOTE: comments cannot be placed inside the
// macro body itself because of the backslash line continuations.
1315#define CUDA_DRIVER_APIS(__op__) \
1316 __op__(cuModuleGetFunction, CUDA_VERSION_MIN); \
1317 __op__(cuCtxGetDevice, CUDA_VERSION_MIN); \
1318 __op__(cuCtxEnablePeerAccess, CUDA_VERSION_MIN); \
1319 __op__(cuCtxGetFlags, CUDA_VERSION_MIN); \
1320 __op__(cuCtxGetStreamPriorityRange, CUDA_VERSION_MIN); \
1321 __op__(cuCtxPopCurrent, CUDA_VERSION_MIN); \
1322 __op__(cuCtxPushCurrent, CUDA_VERSION_MIN); \
1323 __op__(cuCtxSynchronize, CUDA_VERSION_MIN); \
1324 __op__(cuDeviceCanAccessPeer, CUDA_VERSION_MIN); \
1325 __op__(cuDeviceGet, CUDA_VERSION_MIN); \
1326 __op__(cuDeviceGetUuid, CUDA_VERSION_MIN); \
1327 __op__(cuDeviceGetAttribute, CUDA_VERSION_MIN); \
1328 __op__(cuDeviceGetCount, CUDA_VERSION_MIN); \
1329 __op__(cuDeviceGetName, CUDA_VERSION_MIN); \
1330 __op__(cuDevicePrimaryCtxRelease, CUDA_VERSION_MIN); \
1331 __op__(cuDevicePrimaryCtxRetain, CUDA_VERSION_MIN); \
1332 __op__(cuDevicePrimaryCtxSetFlags, CUDA_VERSION_MIN); \
1333 __op__(cuDeviceTotalMem, CUDA_VERSION_MIN); \
1334 __op__(cuEventCreate, CUDA_VERSION_MIN); \
1335 __op__(cuEventDestroy, CUDA_VERSION_MIN); \
1336 __op__(cuEventQuery, CUDA_VERSION_MIN); \
1337 __op__(cuEventRecord, CUDA_VERSION_MIN); \
1338 __op__(cuGetErrorName, CUDA_VERSION_MIN); \
1339 __op__(cuGetErrorString, CUDA_VERSION_MIN); \
1340 __op__(cuInit, CUDA_VERSION_MIN); \
1341 __op__(cuIpcCloseMemHandle, CUDA_VERSION_MIN); \
1342 __op__(cuIpcGetMemHandle, CUDA_VERSION_MIN); \
1343 __op__(cuIpcOpenMemHandle, CUDA_VERSION_MIN); \
1344 __op__(cuLaunchKernel, CUDA_VERSION_MIN); \
1345 __op__(cuMemAllocManaged, CUDA_VERSION_MIN); \
1346 __op__(cuMemAlloc, CUDA_VERSION_MIN); \
1347 __op__(cuMemcpy2DAsync, CUDA_VERSION_MIN); \
1348 __op__(cuMemcpy3DAsync, CUDA_VERSION_MIN); \
1349 __op__(cuMemcpyAsync, CUDA_VERSION_MIN); \
1350 __op__(cuMemcpyDtoDAsync, CUDA_VERSION_MIN); \
1351 __op__(cuMemcpyDtoH, CUDA_VERSION_MIN); \
1352 __op__(cuMemcpyDtoHAsync, CUDA_VERSION_MIN); \
1353 __op__(cuMemcpyHtoD, CUDA_VERSION_MIN); \
1354 __op__(cuMemcpyHtoDAsync, CUDA_VERSION_MIN); \
1355 __op__(cuMemFreeHost, CUDA_VERSION_MIN); \
1356 __op__(cuMemFree, CUDA_VERSION_MIN); \
1357 __op__(cuMemGetInfo, CUDA_VERSION_MIN); \
1358 __op__(cuMemHostAlloc, CUDA_VERSION_MIN); \
1359 __op__(cuMemHostGetDevicePointer, CUDA_VERSION_MIN); \
1360 __op__(cuMemHostRegister, CUDA_VERSION_MIN); \
1361 __op__(cuMemHostUnregister, CUDA_VERSION_MIN); \
1362 __op__(cuMemsetD16Async, CUDA_VERSION_MIN); \
1363 __op__(cuMemsetD2D16Async, CUDA_VERSION_MIN); \
1364 __op__(cuMemsetD2D32Async, CUDA_VERSION_MIN); \
1365 __op__(cuMemsetD2D8Async, CUDA_VERSION_MIN); \
1366 __op__(cuMemsetD32Async, CUDA_VERSION_MIN); \
1367 __op__(cuMemsetD8Async, CUDA_VERSION_MIN); \
1368 __op__(cuModuleLoadDataEx, CUDA_VERSION_MIN); \
1369 __op__(cuStreamAddCallback, CUDA_VERSION_MIN); \
1370 __op__(cuStreamCreate, CUDA_VERSION_MIN); \
1371 __op__(cuStreamCreateWithPriority, CUDA_VERSION_MIN); \
1372 __op__(cuStreamDestroy, CUDA_VERSION_MIN); \
1373 __op__(cuStreamSynchronize, CUDA_VERSION_MIN); \
1374 __op__(cuOccupancyMaxPotentialBlockSize, CUDA_VERSION_MIN); \
1375 __op__(cuOccupancyMaxPotentialBlockSizeWithFlags, CUDA_VERSION_MIN); \
1376 __op__(cuEventSynchronize, CUDA_VERSION_MIN); \
1377 __op__(cuEventElapsedTime, CUDA_VERSION_MIN); \
1378 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessor, CUDA_VERSION_MIN); \
1379 __op__(cuMemAddressReserve, CUDA_VERSION_MIN); \
1380 __op__(cuMemAddressFree, CUDA_VERSION_MIN); \
1381 __op__(cuMemCreate, CUDA_VERSION_MIN); \
1382 __op__(cuMemRelease, CUDA_VERSION_MIN); \
1383 __op__(cuMemMap, CUDA_VERSION_MIN); \
1384 __op__(cuMemUnmap, CUDA_VERSION_MIN); \
1385 __op__(cuMemSetAccess, CUDA_VERSION_MIN); \
1386 __op__(cuMemGetAllocationGranularity, CUDA_VERSION_MIN); \
1387 __op__(cuMemGetAllocationPropertiesFromHandle, CUDA_VERSION_MIN); \
1388 __op__(cuMemExportToShareableHandle, CUDA_VERSION_MIN); \
1389 __op__(cuMemImportFromShareableHandle, CUDA_VERSION_MIN); \
1390 __op__(cuStreamWaitEvent, CUDA_VERSION_MIN); \
1391 __op__(cuStreamQuery, CUDA_VERSION_MIN); \
1392 __op__(cuMemGetAddressRange, CUDA_VERSION_MIN); \
1393 __op__(cuPointerGetAttributes, CUDA_VERSION_MIN); \
1394 __op__(cuDriverGetVersion, CUDA_VERSION_MIN); \
1395 __op__(cuMemAdvise, CUDA_VERSION_MIN); \
1396 __op__(cuMemPrefetchAsync, CUDA_VERSION_MIN); \
1397 __op__(cuCtxSetSharedMemConfig, CUDA_VERSION_MIN); \
1398 __op__(cuCtxSetCacheConfig, CUDA_VERSION_MIN); \
1399 __op__(cuCtxSetLimit, CUDA_VERSION_MIN); \
1400 __op__(cuCtxGetLimit, CUDA_VERSION_MIN); \
1401 __op__(cuFuncSetAttribute, CUDA_VERSION_MIN); \
1402 __op__(cuFuncSetCacheConfig, CUDA_VERSION_MIN); \
1403 __op__(cuFuncSetSharedMemConfig, CUDA_VERSION_MIN); \
1404 __op__(cuFuncGetAttribute, CUDA_VERSION_MIN); \
1405 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, CUDA_VERSION_MIN); \
1406 __op__(cuArray3DCreate, CUDA_VERSION_MIN); \
1407 __op__(cuArrayDestroy, CUDA_VERSION_MIN); \
1408 __op__(cuSurfObjectCreate, CUDA_VERSION_MIN); \
1409 __op__(cuSurfObjectDestroy, CUDA_VERSION_MIN); \
1410 __op__(cuLaunchCooperativeKernel, CUDA_VERSION_MIN); \
1411 __op__(cuModuleGetGlobal, CUDA_VERSION_MIN); \
1412 __op__(cuLaunchHostFunc, CUDA_VERSION_MIN); \
1413 __op__(cuCtxRecordEvent, 12050); \
1414 __op__(cuArrayGetMemoryRequirements, CUDA_VERSION_MIN);
1415
1416// Make sure to only use decltype, to ensure it matches the cuda.h definition
// Generates "extern decltype(&fn) fn_fnptr;" for each driver API; it is
// expanded over CUDA_DRIVER_APIS (expansion line not visible in this
// rendering) and immediately #undef'd so the helper name does not leak.
1417#define DECL_FNPTR_EXTERN(name, ver) extern decltype(&name) name##_fnptr;
1419#undef DECL_FNPTR_EXTERN
1420
// Accessor for the NVML function pointer corresponding to 'name'
// (name##_fnptr, declared via the NVML DECL_FNPTR_EXTERN expansion below).
1421#define NVML_FNPTR(name) (name##_fnptr)
1422
1423#if NVML_API_VERSION >= 11
// Extra NVML entry points only declared by NVML v11+ headers; expands to
// nothing on older headers so the NVML_APIS table below stays well-formed.
1424#define NVML_11_APIS(__op__) __op__(nvmlDeviceGetMemoryAffinity);
1425#else
1426#define NVML_11_APIS(__op__)
1427#endif
1428
1429#if NVML_API_VERSION >= 12
// Extra NVML entry points only declared by NVML v12+ headers.
1430#define NVML_12_APIS(__op__) __op__(nvmlDeviceGetGpuFabricInfo)
1431#else
1432#define NVML_12_APIS(__op__)
1433#endif
1434
1435#if CUDA_VERSION < 11040
1436 // Define an NVML api that doesn't exist prior to CUDA Toolkit 11.5, but should
1437 // exist in systems that we need to support (we'll detect its
1438 // availability at runtime later)
1439 //
1440 // Although these are NVML apis, NVML_API_VERSION doesn't support any way to detect
1441 // minor versioning, so we'll use the cuda header's versioning here, which should
1442 // coincide with the versions we're looking for
1450
1451 nvmlReturn_t
1452 nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link,
1453 nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType);
1454#endif
1455
// X-macro table of NVML entry points Realm uses (single-argument form — no
// version tag, unlike CUDA_DRIVER_APIS).  Header-version-gated entries are
// appended via NVML_11_APIS / NVML_12_APIS defined above.
1456#define NVML_APIS(__op__) \
1457 __op__(nvmlInit); \
1458 __op__(nvmlDeviceGetHandleByUUID); \
1459 __op__(nvmlDeviceGetMaxPcieLinkWidth); \
1460 __op__(nvmlDeviceGetMaxPcieLinkGeneration); \
1461 __op__(nvmlDeviceGetNvLinkState); \
1462 __op__(nvmlDeviceGetNvLinkVersion); \
1463 __op__(nvmlDeviceGetNvLinkRemotePciInfo); \
1464 __op__(nvmlDeviceGetNvLinkRemoteDeviceType); \
1465 __op__(nvmlDeviceGetDeviceHandleFromMigDeviceHandle); \
1466 __op__(nvmlDeviceGetFieldValues); \
1467 NVML_11_APIS(__op__); \
1468 NVML_12_APIS(__op__);
1469
// Declares an extern function pointer per NVML API (expanded over NVML_APIS;
// expansion line not visible in this rendering), then #undef'd to keep the
// helper's scope local.
1470#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1472#undef DECL_FNPTR_EXTERN
1473
// X-macro table of CUPTI Activity-API entry points Realm uses (for profiling
// hooks); same single-argument expansion pattern as NVML_APIS.
1474#define CUPTI_APIS(__op__) \
1475 __op__(cuptiActivityRegisterCallbacks); \
1476 __op__(cuptiActivityEnable); \
1477 __op__(cuptiActivityDisable); \
1478 __op__(cuptiActivityEnableContext); \
1479 __op__(cuptiActivityDisableContext); \
1480 __op__(cuptiActivityFlushAll); \
1481 __op__(cuptiActivityGetNextRecord); \
1482 __op__(cuptiActivityRegisterTimestampCallback); \
1483 __op__(cuptiActivityPushExternalCorrelationId); \
1484 __op__(cuptiActivityPopExternalCorrelationId);
1485
// Declares an extern function pointer per CUPTI API (expanded over
// CUPTI_APIS; expansion line not visible in this rendering), then #undef'd.
1486#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1488#undef DECL_FNPTR_EXTERN
1489
// True when the CUPTI entry point's function pointer is non-null (i.e. it
// was resolved — CUPTI is treated as optional, unlike the driver APIs).
1490#define CUPTI_HAS_FNPTR(name) (name##_fnptr != nullptr)
// Checked accessor: asserts (debug builds) that the pointer was resolved
// before handing it back; evaluates to the pointer via the comma operator.
1491#define CUPTI_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
1492
1493 }; // namespace Cuda
1494
1495}; // namespace Realm
1496
1497#endif
bootstrap_handle_t * handle
Definition bootstrap.h:61
Definition bgwork.h:129
Definition bgwork.h:36
Definition bytearray.h:30
Definition bytearray.h:53
Definition channel.h:713
Definition circ_queue.h:35
Definition codedesc.h:249
Definition threads.h:382
Definition threads.h:342
Definition cuda_internal.h:766
int dim
Definition cuda_internal.h:774
CUarray array
Definition cuda_internal.h:773
size_t width_in_bytes
Definition cuda_internal.h:776
size_t height
Definition cuda_internal.h:776
size_t pos[3]
Definition cuda_internal.h:775
size_t depth
Definition cuda_internal.h:776
virtual int set_rect(const RegionInstanceImpl *inst, const InstanceLayoutPieceBase *piece, size_t field_size, size_t field_offset, int ndims, const int64_t lo[], const int64_t hi[], const int order[])
Definition cuda_internal.h:528
GPU * gpu
Definition cuda_internal.h:535
Definition cuda_internal.h:360
Mutex mutex
Definition cuda_internal.h:376
std::vector< Thread * > worker_threads
Definition cuda_internal.h:381
int total_threads
Definition cuda_internal.h:380
CoreReservation * core_rsrv
Definition cuda_internal.h:382
int max_threads
Definition cuda_internal.h:375
void add_fence(GPUWorkFence *fence)
ContextSynchronizer(GPU *_gpu, CUcontext _context, CoreReservationSet &crs, int _max_threads)
GPU * gpu
Definition cuda_internal.h:373
int syncing_threads
Definition cuda_internal.h:380
Mutex::CondVar condvar
Definition cuda_internal.h:377
CUcontext context
Definition cuda_internal.h:374
int sleeping_threads
Definition cuda_internal.h:380
bool shutdown_flag
Definition cuda_internal.h:378
GPUWorkFence::FenceList fences
Definition cuda_internal.h:379
Definition cuda_internal.h:577
CudaDeviceMemoryInfo(CUcontext _context)
GPU * gpu
Definition cuda_internal.h:582
CUcontext context
Definition cuda_internal.h:581
Definition cuda_module.h:165
Class for managing the lifetime of a given gpu allocation. As instances of this class own an underlyi...
Definition cuda_internal.h:1054
static GPUAllocation * open_handle(GPU *gpu, OsHandle hdl, size_t size, bool peer_enabled=true)
Retrieves the GPUAllocation given the OsHandle.
static GPUAllocation * register_allocation(GPU *gpu, void *ptr, size_t size, bool peer_enabled=true)
Create an allocation that registers the given CPU address range with CUDA, making it accessible from ...
static GPUAllocation * allocate_host(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true, bool same_va=true)
Allocate CPU-located memory for the given gpu with the given size and features.
T * get_hptr(void) const
Retrieves the CPU accessible base address for the allocation, or nullptr if there is no way to access...
Definition cuda_internal.h:1113
static GPUAllocation * allocate_managed(GPU *gpu, size_t size)
Allocate migratable memory that can be used with CUDA's managed memory APIs (cuMemPrefetchAsync,...
GPUAllocation(const GPUAllocation &)=delete
size_t get_size(void) const
Retrieves the given size of the allocation.
Definition cuda_internal.h:1105
OsHandle get_os_handle(void) const
Accessor for the file descriptor or win32 HANDLE associated with the allocation. This handle can be s...
bool get_ipc_handle(CUipcMemHandle &handle) const
Retrieves the CUipcMemHandle for this allocation that can be used with GPUAllocation::open_ipc.
Definition cuda_internal.h:1077
static GPUAllocation * allocate_dev(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true)
Allocates device-located memory for the given gpu with the given size and features.
GPUAllocation & operator=(GPUAllocation &&) noexcept
static void * get_win32_shared_attributes(void)
Retrieves the default win32 shared attributes for creating a shared object that can be set in CUmemAl...
static GPUAllocation * open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl)
Retrieves the GPUAllocation given the CUipcMemHandle.
GPUAllocation(GPUAllocation &&other) noexcept
CUdeviceptr get_dptr(void) const
Retrieves the base CUdeviceptr for the associated allocation that can be used to access the underlyin...
Definition cuda_internal.h:1099
GPU * get_gpu(void) const
Retrieves the owning GPU.
Definition cuda_internal.h:1102
Definition cuda_internal.h:890
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition cuda_internal.h:897
long submit(Request **requests, long nr)
GPU * get_gpu() const
Definition cuda_internal.h:908
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
Definition cuda_internal.h:703
GPURequest * req
Definition cuda_internal.h:707
Definition cuda_internal.h:196
virtual ~GPUCompletionNotification(void)
Definition cuda_internal.h:198
virtual void request_completed(void)=0
Definition cuda_internal.h:183
void destroy_context(InternalTask *task, void *context) const override
void destroy_context(Task *task, void *context) const override
GPU * gpu
Definition cuda_internal.h:190
void * create_context(Task *task) const override
GPUContextManager(GPU *_gpu, GPUProcessor *proc)
void * create_context(InternalTask *task) const override
GPUProcessor * proc
Definition cuda_internal.h:191
Definition cuda_internal.h:616
GPU * gpu
Definition cuda_internal.h:652
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)
size_t cur_size
Definition cuda_internal.h:654
NetworkSegment local_segment
Definition cuda_internal.h:656
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)
Mutex mutex
Definition cuda_internal.h:653
std::map< RegionInstance, std::pair< CUdeviceptr, size_t > > alloc_bases
Definition cuda_internal.h:655
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
Definition cuda_internal.h:337
CUevent get_event(bool external=false)
Mutex mutex
Definition cuda_internal.h:351
void init_pool(int init_size=0)
int batch_size
Definition cuda_internal.h:352
void return_event(CUevent e, bool external=false)
std::vector< CUevent > available_events
Definition cuda_internal.h:353
GPUEventPool(int _batch_size=256)
int total_size
Definition cuda_internal.h:352
int current_size
Definition cuda_internal.h:352
int external_count
Definition cuda_internal.h:352
Definition cuda_internal.h:690
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
NetworkSegment local_segment
Definition cuda_internal.h:698
CUdeviceptr base
Definition cuda_internal.h:697
GPU * gpu
Definition cuda_internal.h:696
Definition cuda_internal.h:585
NetworkSegment local_segment
Definition cuda_internal.h:613
GPU * gpu
Definition cuda_internal.h:611
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void * get_direct_ptr(off_t offset, size_t size)
CUdeviceptr base
Definition cuda_internal.h:612
virtual void put_bytes(off_t offset, const void *src, size_t size)
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual ~GPUFBMemory(void)
Definition cuda_internal.h:814
GPU * get_gpu() const
Definition cuda_internal.h:846
static const bool is_ordered
Definition cuda_internal.h:821
GPU * src_gpu
Definition cuda_internal.h:850
virtual bool supports_indirection_memory(Memory mem) const
Queries if a given mem can be used as an indirection buffer.
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
virtual Memory suggest_ib_memories() const
GPUIndirectChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
virtual bool needs_wrapping_iterator() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret=0, unsigned *bw_ret=0, unsigned *lat_ret=0)
Definition cuda_internal.h:853
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUIndirectRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:871
bool serialize(S &serializer) const
GPUIndirectRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths, const std::vector< Memory > &_indirect_memories)
virtual RemoteChannel * create_remote_channel()
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:874
virtual bool needs_wrapping_iterator() const
virtual Memory suggest_ib_memories() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret, unsigned *bw_ret, unsigned *lat_ret)
GPUIndirectRemoteChannel(uintptr_t _remote_ptr, const std::vector< Memory > &_indirect_memories)
Definition cuda_internal.h:719
size_t read_ind_offset
Definition cuda_internal.h:735
size_t write_size
Definition cuda_internal.h:737
size_t write_offset
Definition cuda_internal.h:737
int write_ind_port_idx
Definition cuda_internal.h:738
size_t read_offset
Definition cuda_internal.h:733
size_t write_ind_offset
Definition cuda_internal.h:739
size_t read_ind_size
Definition cuda_internal.h:735
GPUIndirectTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size, int _read_ind_port_idx=-1, size_t _read_ind_offset=0, size_t _read_ind_size=0, int _write_ind_port_idx=-1, size_t _write_ind_offset=0, size_t _write_ind_size=0)
int write_port_idx
Definition cuda_internal.h:736
size_t write_ind_size
Definition cuda_internal.h:739
int read_ind_port_idx
Definition cuda_internal.h:734
XferDes * xd
Definition cuda_internal.h:731
size_t read_size
Definition cuda_internal.h:733
int read_port_idx
Definition cuda_internal.h:732
Definition cuda_internal.h:798
bool progress_xd(GPUIndirectChannel *channel, TimeLimit work_until)
std::vector< bool > dst_is_ipc
Definition cuda_internal.h:810
std::vector< GPU * > dst_gpus
Definition cuda_internal.h:809
long get_requests(Request **requests, long nr)
std::vector< GPU * > src_gpus
Definition cuda_internal.h:809
GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
Definition cuda_internal.h:539
virtual void shutdown(void)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:559
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)
virtual ~GPUProcessor(void)
GPU * gpu
Definition cuda_internal.h:556
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition cuda_internal.h:569
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)
Definition cuda_internal.h:1040
GPUReplHeapListener(CudaModule *_module)
virtual void chunk_created(void *base, size_t bytes)
virtual void chunk_destroyed(void *base, size_t bytes)
Definition cuda_internal.h:710
GPUCompletionEvent event
Definition cuda_internal.h:716
void * dst_base
Definition cuda_internal.h:713
const void * src_base
Definition cuda_internal.h:712
GPU * dst_gpu
Definition cuda_internal.h:715
Definition cuda_internal.h:247
bool ok_to_submit_copy(size_t bytes, XferDes *xd)
REALM_INTERNAL_API_EXTERNAL_LINKAGE CUstream get_stream(void) const
void add_notification(GPUCompletionNotification *notification)
void add_event(CUevent event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)
Mutex mutex
Definition cuda_internal.h:281
void add_start_event(GPUWorkStart *start)
bool has_work(void) const
GPU * gpu
Definition cuda_internal.h:276
GPU * get_gpu(void) const
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)
GPUWorker * worker
Definition cuda_internal.h:277
bool reap_events(TimeLimit work_until)
void add_fence(GPUWorkFence *fence)
std::deque< PendingEvent > pending_events
Definition cuda_internal.h:291
void wait_on_streams(const std::set< GPUStream * > &other_streams)
CUstream stream
Definition cuda_internal.h:279
Definition cuda_internal.h:742
size_t read_offset
Definition cuda_internal.h:753
int write_port_idx
Definition cuda_internal.h:754
XferDes * xd
Definition cuda_internal.h:751
size_t write_size
Definition cuda_internal.h:755
size_t write_offset
Definition cuda_internal.h:755
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)
virtual void request_completed(void)
size_t read_size
Definition cuda_internal.h:753
int read_port_idx
Definition cuda_internal.h:752
Definition cuda_internal.h:203
virtual void request_cancellation(void)
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition cuda_internal.h:216
GPU * gpu
Definition cuda_internal.h:224
void enqueue_on_stream(GPUStream *stream)
virtual void print(std::ostream &os) const
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition cuda_internal.h:220
GPUWorkFence(GPU *gpu, Realm::Operation *op)
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)
static void cuda_callback(CUstream stream, CUresult res, void *data)
virtual void mark_finished(bool successful)
Definition cuda_internal.h:227
void enqueue_on_stream(GPUStream *stream)
GPUWorkStart(Realm::Operation *op)
virtual void request_cancellation(void)
Definition cuda_internal.h:231
virtual void print(std::ostream &os) const
static void cuda_start_callback(CUstream stream, CUresult res, void *data)
Definition cuda_internal.h:298
virtual ~GPUWorker(void)
bool process_streams(bool sleep_on_empty)
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition cuda_internal.h:325
void shutdown_background_thread(void)
ActiveStreamQueue active_streams
Definition cuda_internal.h:326
void add_stream(GPUStream *s)
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:329
Mutex::CondVar condvar
Definition cuda_internal.h:323
Mutex lock
Definition cuda_internal.h:322
atomic< bool > worker_shutdown_requested
Definition cuda_internal.h:332
bool do_work(TimeLimit work_until)
bool thread_sleeping
Definition cuda_internal.h:331
Realm::Thread * worker_thread
Definition cuda_internal.h:330
Definition cuda_internal.h:781
bool progress_xd(GPUChannel *channel, TimeLimit work_until)
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)
long get_requests(Request **requests, long nr)
Definition cuda_internal.h:659
NetworkSegment local_segment
Definition cuda_internal.h:687
virtual ~GPUZCMemory(void)
char * cpu_base
Definition cuda_internal.h:686
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void put_bytes(off_t offset, const void *src, size_t size)
CUdeviceptr gpu_base
Definition cuda_internal.h:685
virtual void get_bytes(off_t offset, void *dst, size_t size)
Definition cuda_internal.h:392
void pop_context(void)
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)
void launch_transpose_kernel(MemcpyTransposeInfo< size_t > &copy_info, size_t elemSize, GPUStream *stream)
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)
void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
bool register_reduction(ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl, CUfunction fold_excl, CUfunction fold_nonexcl)
CUdeviceptr fbmem_base
Definition cuda_internal.h:475
GPUFuncInfo fill_affine_large_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:472
std::vector< CudaIpcMapping > cudaipc_mappings
Definition cuda_internal.h:512
bool can_access_peer(const GPU *peer) const
GPUFBMemory * fbmem
Definition cuda_internal.h:444
GPUFuncInfo transpose_kernels[CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:473
std::unordered_map< ReductionOpID, GPUReductionOpEntry > gpu_reduction_table
Definition cuda_internal.h:524
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const
GPUStream * device_to_host_stream
Definition cuda_internal.h:490
GPUProcessor * proc
Definition cuda_internal.h:441
GPUFuncInfo batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:468
std::vector< GPUStream * > task_streams
Definition cuda_internal.h:494
ContextSynchronizer ctxsync
Definition cuda_internal.h:437
CUmodule device_module
Definition cuda_internal.h:450
void create_processor(RuntimeImpl *runtime, size_t stack_size)
Mutex alloc_mutex
Definition cuda_internal.h:514
std::set< Memory > managed_mems
Definition cuda_internal.h:483
GPUStream * host_to_device_stream
Definition cuda_internal.h:489
GPUStream * get_next_d2d_stream()
GPUFuncInfo indirect_copy_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:467
GPUStream * device_to_device_stream
Definition cuda_internal.h:491
size_t cupti_activity_refcount
Definition cuda_internal.h:497
std::map< NodeID, GPUStream * > cudaipc_streams
Definition cuda_internal.h:513
std::map< CUdeviceptr, GPUAllocation > allocations
Definition cuda_internal.h:443
void push_context(void)
std::set< Memory > pinned_sysmems
Definition cuda_internal.h:480
GPUFuncInfo batch_fill_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:470
static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES
Definition cuda_internal.h:464
GPUStream * get_next_task_stream(bool create=false)
void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size, size_t field_size, size_t volume, GPUStream *stream)
CUmodule load_cuda_module(const void *data)
int least_stream_priority
Definition cuda_internal.h:503
CUdeviceptr fb_ibmem_base
Definition cuda_internal.h:477
GPUAllocation & add_allocation(GPUAllocation &&alloc)
GPUDynamicFBMemory * fb_dmem
Definition cuda_internal.h:445
void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
int greatest_stream_priority
Definition cuda_internal.h:503
const CudaIpcMapping * find_ipc_mapping(Memory mem) const
GPUEventPool event_pool
Definition cuda_internal.h:499
CUcontext context
Definition cuda_internal.h:448
GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context)
std::vector< GPUStream * > device_to_device_streams
Definition cuda_internal.h:492
GPUFBIBMemory * fb_ibmem
Definition cuda_internal.h:446
atomic< unsigned > next_task_stream
Definition cuda_internal.h:495
GPUStream * find_stream(CUstream stream) const
bool is_accessible_host_mem(const MemoryImpl *mem) const
GPUInfo * info
Definition cuda_internal.h:439
std::vector< GPUStream * > peer_to_peer_streams
Definition cuda_internal.h:493
GPUWorker * worker
Definition cuda_internal.h:440
void create_dma_channels(Realm::RuntimeImpl *r)
std::set< Memory > peer_fbs
Definition cuda_internal.h:486
atomic< unsigned > next_d2d_stream
Definition cuda_internal.h:496
bool is_accessible_gpu_mem(const MemoryImpl *mem) const
Definition cuda_internal.h:932
static const bool is_ordered
Definition cuda_internal.h:937
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
long submit(Request **requests, long nr)
GPU * gpu
Definition cuda_internal.h:952
Definition cuda_internal.h:917
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)
size_t reduced_fill_size
Definition cuda_internal.h:929
long get_requests(Request **requests, long nr)
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)
Definition cuda_internal.h:978
GPU * gpu
Definition cuda_internal.h:1002
RemoteChannelInfo * construct_remote_info() const override
static const bool is_ordered
Definition cuda_internal.h:983
XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total) override
long submit(Request **requests, long nr) override
bool supports_redop(ReductionOpID redop_id) const override
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
Definition cuda_internal.h:1005
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
bool serialize(S &serializer) const
virtual RemoteChannel * create_remote_channel()
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:1021
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:1024
Definition cuda_internal.h:957
long get_requests(Request **requests, long nr)
std::vector< bool > src_is_ipc
Definition cuda_internal.h:975
GPUStream * stream
Definition cuda_internal.h:973
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
const void * kernel_host_proxy
Definition cuda_internal.h:972
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)
CUfunction kernel
Definition cuda_internal.h:971
XferDesRedopInfo redop_info
Definition cuda_internal.h:969
std::vector< GPU * > src_gpus
Definition cuda_internal.h:974
const ReductionOpUntyped * redop
Definition cuda_internal.h:970
Definition cuda_internal.h:758
CUarray array
Definition cuda_internal.h:763
Definition threads.h:428
Definition instance.h:405
Definition ib_memory.h:30
Definition indexspace.h:1115
Definition inst_layout.h:266
Definition tasks.h:181
Definition lists.h:66
Definition mem_impl.h:344
Definition proc_impl.h:141
Definition mem_impl.h:212
Definition mem_impl.h:50
MemoryKind
Definition mem_impl.h:53
size_t size
Definition mem_impl.h:195
AllocationResult
Definition mem_impl.h:89
Definition memory.h:33
Kind
Definition memory.h:59
Definition module.h:100
Definition network.h:262
Definition operation.h:75
Operation * op
Definition operation.h:87
Definition operation.h:32
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition inst_impl.h:54
Definition channel.h:891
Definition channel.h:934
Definition repl_heap.h:50
Definition channel.h:103
Definition runtime_impl.h:264
Definition channel.h:904
Definition channel.h:1014
Definition tasks.h:199
Definition tasks.h:41
Definition threads.h:89
Definition timers.h:129
Definition mutex.h:325
Definition mutex.h:223
Definition channel.h:286
Channel * channel
Definition channel.h:343
Definition atomics.h:31
Definition utils.h:84
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define CUDA_DRIVER_APIS(__op__)
Definition cuda_internal.h:1315
#define NVML_APIS(__op__)
Definition cuda_internal.h:1456
#define DECL_FNPTR_EXTERN(name, ver)
Definition cuda_internal.h:1417
#define CUPTI_APIS(__op__)
Definition cuda_internal.h:1474
#define cudaDeviceProp
Definition hip_cuda.h:24
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42
CudaModule * cuda_module_singleton
CUresult cuGetProcAddress(const char *, void **, int, int)
CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event)
GPUMemcpyKind
Definition cuda_internal.h:162
@ GPU_MEMCPY_PEER_TO_PEER
Definition cuda_internal.h:166
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition cuda_internal.h:163
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition cuda_internal.h:164
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition cuda_internal.h:165
nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType)
nvmlIntNvLinkDeviceType_enum
Definition cuda_internal.h:1444
@ NVML_NVLINK_DEVICE_TYPE_IBMNPU
Definition cuda_internal.h:1446
@ NVML_NVLINK_DEVICE_TYPE_SWITCH
Definition cuda_internal.h:1447
@ NVML_NVLINK_DEVICE_TYPE_UNKNOWN
Definition cuda_internal.h:1448
@ NVML_NVLINK_DEVICE_TYPE_GPU
Definition cuda_internal.h:1445
enum Realm::Cuda::nvmlIntNvLinkDeviceType_enum nvmlIntNvLinkDeviceType_t
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
XferDesKind
Definition channel.h:85
int CustomSerdezID
Definition custom_serdez.h:148
int OsHandle
Definition utils.h:399
unsigned long long XferDesID
Definition channel.h:57
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
#define REALM_MAX_DIM
Definition realm_config.h:34
Definition channel.h:684
Definition cuda_internal.h:1031
static void handle_message(NodeID sender, const CudaIpcImportRequest &args, const void *data, size_t datalen)
unsigned count
Definition cuda_internal.h:1032
long hostid
Definition cuda_internal.h:1034
Definition cudart_hijack.h:53
Definition cuda_internal.h:127
int pci_busid
Definition cuda_internal.h:141
CUdevice device
Definition cuda_internal.h:129
size_t pci_bandwidth
Definition cuda_internal.h:145
std::vector< size_t > logical_peer_bandwidth
Definition cuda_internal.h:148
int pci_domainid
Definition cuda_internal.h:142
CUuuid uuid
Definition cuda_internal.h:131
std::set< CUdevice > peers
Definition cuda_internal.h:140
bool has_numa_preference
Definition cuda_internal.h:138
bool pageable_access_supported
Definition cuda_internal.h:154
std::vector< size_t > logical_peer_latency
Definition cuda_internal.h:149
bool host_gpu_same_va
Definition cuda_internal.h:147
unsigned fabric_clique
Definition cuda_internal.h:152
bool fabric_supported
Definition cuda_internal.h:151
char name[MAX_NAME_LEN]
Definition cuda_internal.h:135
int major
Definition cuda_internal.h:132
size_t totalGlobalMem
Definition cuda_internal.h:136
int pci_deviceid
Definition cuda_internal.h:143
nvmlDevice_t nvml_dev
Definition cuda_internal.h:130
unsigned long numa_node_affinity[MAX_NUMA_NODE_LEN]
Definition cuda_internal.h:139
size_t c2c_bandwidth
Definition cuda_internal.h:144
int index
Definition cuda_internal.h:128
int minor
Definition cuda_internal.h:133
size_t nvswitch_bandwidth
Definition cuda_internal.h:146
CUuuid fabric_uuid
Definition cuda_internal.h:153
static const size_t MAX_NAME_LEN
Definition cuda_internal.h:134
static const size_t MAX_NUMA_NODE_LEN
Definition cuda_internal.h:137
Definition cuda_internal.h:561
Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition cuda_internal.h:563
Processor::TaskFuncPtr fnptr
Definition cuda_internal.h:562
ByteArray user_data
Definition cuda_internal.h:564
Definition cuda_internal.h:282
GPUWorkStart * start
Definition cuda_internal.h:285
CUevent event
Definition cuda_internal.h:283
GPUWorkFence * fence
Definition cuda_internal.h:284
GPUCompletionNotification * notification
Definition cuda_internal.h:286
Definition cuda_internal.h:505
uintptr_t address_offset
Definition cuda_internal.h:510
NodeID owner
Definition cuda_internal.h:506
GPU * src_gpu
Definition cuda_internal.h:507
Memory mem
Definition cuda_internal.h:508
uintptr_t local_base
Definition cuda_internal.h:509
Definition cuda_internal.h:452
CUfunction func
Definition cuda_internal.h:453
int occ_num_threads
Definition cuda_internal.h:454
int occ_num_blocks
Definition cuda_internal.h:455
Definition cuda_internal.h:517
CUfunction fold_excl
Definition cuda_internal.h:521
CUfunction apply_nonexcl
Definition cuda_internal.h:518
CUfunction fold_nonexcl
Definition cuda_internal.h:520
CUfunction apply_excl
Definition cuda_internal.h:519
Definition cuda_memcpy.h:109
Definition cudart_hijack.h:65
Definition cudart_hijack.h:76
Definition redop.h:56
Definition channel.h:210
NodeID src
Definition ucp_internal.h:1