Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
cuda_module.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CUDA_H
19#define REALM_CUDA_H
20
21#include <type_traits>
22#include "realm/realm_config.h"
23#include "realm/module.h"
24#include "realm/processor.h"
25#include "realm/network.h"
26#include "realm/atomics.h"
27
28// realm/cuda_module.h is designed to be include-able even when the system
29// doesn't actually have CUDA installed, so we need to declare types that
30// are compatible with the CUDA driver and runtime APIs - we can't "extern"
31// a typedef (e.g. cudaStream_t) but we can forward declare the underlying
32// struct that those types are pointers to
33struct CUstream_st; // cudaStream_t == CUstream == CUstream_st *
34struct CUevent_st;
35struct CUctx_st;
36struct CUfunc_st;
37
38namespace Realm {
39
40 namespace NetworkSegmentInfo {
41 // CUDA device memory - extra is a uintptr_t'd pointer to the GPU
42 // object
43 static const MemoryType CudaDeviceMem = 2;
44
45 // CUDA managed memory - extra is a uintptr_t'd pointer to _one of_
46 // the GPU objects
47 static const MemoryType CudaManagedMem = 4;
48 }; // namespace NetworkSegmentInfo
49
50 namespace Cuda {
51
52 // a running task on a CUDA processor is assigned a stream by Realm, and
53 // any work placed on this stream is automatically captured by the
54 // completion event for the task
55 // when using the CUDA runtime hijack, Realm will force work launched via
56 // the runtime API to use the task's stream, but without hijack, or for
57 // code that uses the CUDA driver API, the task must explicitly request
58 // the stream that is associated with the task and place work on it to
59 // avoid more expensive forms of completion detection for the task
60 // NOTE: this function will return a null pointer if called outside of a
61 // task running on a CUDA processor
 62 REALM_PUBLIC_API CUstream_st *get_task_cuda_stream();
 63
64 // when Realm is not using the CUDA runtime hijack to force work onto the
65 // task's stream, it conservatively uses a full context synchronization to
66 // make sure all device work launched by the task is captured by the task
67 // completion event - if a task uses `get_task_cuda_stream` and places all
68 // work on that stream, this API can be used to tell Realm on a per-task
69 // basis that full context synchronization is not required
 70 REALM_PUBLIC_API void set_task_ctxsync_required(bool is_required);
 71
72 // rather than using the APIs above, CUDA processors also support task
73 // implementations that are natively stream aware - if a task function uses
74 // the `Cuda::StreamAwareTaskFuncPtr` prototype below (instead of the normal
75 // `Processor::TaskFuncPtr`), the following differences apply:
76 // a) it need not call `get_task_cuda_stream` because it gets the same value
77 // directly as an argument
78 // b) by default, a context synchronization will NOT be performed as part of
79 // task completion detection (this can still be overridden with a call to
80 // `set_task_ctxsync_required(true)` if a task puts work outside the
 81 // specified stream for some reason)
82 // c) if a stream-aware task has preconditions that involve device work, that
83 // work will be tied into the task's stream, but the task body may start
84 // executing BEFORE that work is complete (i.e. for correctness, all work
85 // launched by the task must be properly ordered (using the CUDA APIs)
86 // after anything already in the stream assigned to the task
87 typedef void (*StreamAwareTaskFuncPtr)(const void *args, size_t arglen,
88 const void *user_data, size_t user_data_len,
89 Processor proc, CUstream_st *stream);
90
 91 // this is the same data structure as CUuuid
92 static const size_t UUID_SIZE = 16; // bytes
93 typedef char Uuid[UUID_SIZE];
94
 95 // fills in CUDA-related info for the CUDA-capable device associated with
 96 // processor
 97 // `p` if available and returns true, or returns false if the processor is unknown,
 98 // not associated with a CUDA-capable device, or information is unavailable
 99 REALM_PUBLIC_API bool get_cuda_device_uuid(Processor p, Uuid *uuid);
 100
 101 REALM_PUBLIC_API bool get_cuda_device_id(Processor p, int *device);
 102
103 class GPU;
104 class GPUWorker;
105 struct GPUInfo;
106 class GPUZCMemory;
108
110 friend class CudaModule;
111
112 protected:
114
116
117 public:
118 virtual void configure_from_cmdline(std::vector<std::string> &cmdline);
119
120 public:
121 // configurations
122 size_t cfg_zc_mem_size = 64 << 20, cfg_zc_ib_size = 256 << 20;
123 size_t cfg_fb_mem_size = 256 << 20, cfg_fb_ib_size = 128 << 20;
126 size_t cfg_dynfb_max_size = ~size_t(0);
128 std::string cfg_gpu_idxs;
134 unsigned cfg_skip_gpu_count = 0;
135 bool cfg_skip_busy_gpus = false;
137 int cfg_task_legacy_sync = 0; // 0 = no, 1 = yes
138 int cfg_task_context_sync = -1; // 0 = no, 1 = yes, -1 = default (based on hijack)
142 size_t cfg_hostreg_limit = 1 << 30;
144 bool cfg_use_cuda_ipc = true;
146 bool cfg_enable_cupti = false;
147
148 // resources
152 std::vector<size_t> res_fbmem_sizes;
153 };
154
158 CUfunc_st *apply_excl = nullptr;
159 CUfunc_st *apply_nonexcl = nullptr;
160 CUfunc_st *fold_excl = nullptr;
161 CUfunc_st *fold_nonexcl = nullptr;
162 };
163
164 // our interface to the rest of the runtime
166 protected:
168
169 public:
170 virtual ~CudaModule(void);
171
173
175
176 // do any general initialization - this is called after all configuration is
177 // complete
178 virtual void initialize(RuntimeImpl *runtime);
179
180 // create any memories provided by this module (default == do nothing)
181 // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id)
182 virtual void create_memories(RuntimeImpl *runtime);
183
184 // create any processors provided by the module (default == do nothing)
185 // (each new ProcessorImpl should use a Processor from
186 // RuntimeImpl::next_local_processor_id)
187 virtual void create_processors(RuntimeImpl *runtime);
188
189 // create any DMA channels provided by the module (default == do nothing)
190 virtual void create_dma_channels(RuntimeImpl *runtime);
191
192 // create any code translators provided by the module (default == do nothing)
193 virtual void create_code_translators(RuntimeImpl *runtime);
194
195 // if a module has to do cleanup that involves sending messages to other
196 // nodes, this must be done in the pre-detach cleanup
197 virtual void pre_detach_cleanup(void);
198
199 // clean up any common resources created by the module - this will be called
200 // after all memories/processors/etc. have been shut down and destroyed
201 virtual void cleanup(void);
202
203 // free functions above are normally used, but these can be used directly
204 // if you already have a pointer to the CudaModule
205 CUstream_st *get_task_cuda_stream();
206 void set_task_ctxsync_required(bool is_required);
207
213 Event make_realm_event(CUevent_st *cuda_event);
220 Event make_realm_event(CUstream_st *cuda_stream);
221
222 bool get_cuda_device_uuid(Processor p, Uuid *uuid) const;
223
224 bool get_cuda_device_id(Processor p, int *device) const;
225
226 bool get_cuda_context(Processor p, CUctx_st **context) const;
227
228 bool register_reduction(Event &event, const CudaRedOpDesc *descs, size_t num);
229
230 public:
233
234 // "global" variables live here too
236 std::map<GPU *, GPUWorker *> dedicated_workers;
237 std::vector<GPUInfo *> gpu_info;
238 std::vector<GPU *> gpus;
239 void *zcmem_cpu_base, *zcib_cpu_base;
241 void *uvm_base; // guaranteed to be same for CPU and GPU
245
248 atomic<size_t> cudaipc_responses_received{0};
249 int cuda_api_version = 0;
250 };
251
252 }; // namespace Cuda
253
254}; // namespace Realm
255
256#include "realm/cuda/cuda_module.inl"
257
258#endif
Definition cuda_module.h:109
bool cfg_pin_sysmem
Definition cuda_module.h:131
std::vector< size_t > res_fbmem_sizes
Definition cuda_module.h:152
bool cfg_fences_use_callbacks
Definition cuda_module.h:132
int cfg_task_legacy_sync
Definition cuda_module.h:137
size_t cfg_zc_mem_size
Definition cuda_module.h:122
size_t cfg_fb_mem_size
Definition cuda_module.h:123
size_t cfg_dynfb_max_size
Definition cuda_module.h:126
bool cfg_multithread_dma
Definition cuda_module.h:141
bool cfg_use_dynamic_fb
Definition cuda_module.h:125
bool cfg_use_worker_threads
Definition cuda_module.h:130
bool cfg_enable_cupti
Definition cuda_module.h:146
size_t res_min_fbmem_size
Definition cuda_module.h:151
unsigned cfg_d2d_streams
Definition cuda_module.h:129
int cfg_d2d_stream_priority
Definition cuda_module.h:143
std::string cfg_gpu_idxs
Definition cuda_module.h:128
virtual void configure_from_cmdline(std::vector< std::string > &cmdline)
unsigned cfg_task_streams
Definition cuda_module.h:129
bool cfg_suppress_hijack_warning
Definition cuda_module.h:133
size_t cfg_fb_ib_size
Definition cuda_module.h:123
int cfg_pageable_access
Definition cuda_module.h:145
size_t cfg_zc_ib_size
Definition cuda_module.h:122
int res_num_gpus
Definition cuda_module.h:150
bool cfg_use_shared_worker
Definition cuda_module.h:130
size_t cfg_hostreg_limit
Definition cuda_module.h:142
int cfg_task_context_sync
Definition cuda_module.h:138
bool cfg_skip_busy_gpus
Definition cuda_module.h:135
unsigned cfg_skip_gpu_count
Definition cuda_module.h:134
bool cfg_use_cuda_ipc
Definition cuda_module.h:144
bool resource_discovered
Definition cuda_module.h:149
int cfg_num_gpus
Definition cuda_module.h:127
bool cfg_lmem_resize_to_max
Definition cuda_module.h:140
int cfg_max_ctxsync_threads
Definition cuda_module.h:139
size_t cfg_min_avail_mem
Definition cuda_module.h:136
size_t cfg_uvm_mem_size
Definition cuda_module.h:124
Definition cuda_module.h:165
virtual void initialize(RuntimeImpl *runtime)
virtual void create_dma_channels(RuntimeImpl *runtime)
bool get_cuda_context(Processor p, CUctx_st **context) const
virtual void create_memories(RuntimeImpl *runtime)
CudaModule(RuntimeImpl *_runtime)
GPUReplHeapListener * rh_listener
Definition cuda_module.h:243
atomic< bool > initialization_complete
Definition cuda_module.h:244
Mutex cudaipc_mutex
Definition cuda_module.h:246
CUstream_st * get_task_cuda_stream()
CudaModuleConfig * config
Definition cuda_module.h:231
GPUZCMemory * uvmmem
Definition cuda_module.h:242
std::vector< GPU * > gpus
Definition cuda_module.h:238
virtual void create_processors(RuntimeImpl *runtime)
GPUZCMemory * zcmem
Definition cuda_module.h:240
void * zcib_cpu_base
Definition cuda_module.h:239
void * uvm_base
Definition cuda_module.h:241
virtual void create_code_translators(RuntimeImpl *runtime)
GPUWorker * shared_worker
Definition cuda_module.h:235
RuntimeImpl * runtime
Definition cuda_module.h:232
bool register_reduction(Event &event, const CudaRedOpDesc *descs, size_t num)
void set_task_ctxsync_required(bool is_required)
static ModuleConfig * create_module_config(RuntimeImpl *runtime)
bool get_cuda_device_id(Processor p, int *device) const
virtual void cleanup(void)
virtual void pre_detach_cleanup(void)
Mutex::CondVar cudaipc_condvar
Definition cuda_module.h:247
Event make_realm_event(CUevent_st *cuda_event)
Returns a Realm::Event that will be triggered after the given cuda_event has completed.
Event make_realm_event(CUstream_st *cuda_stream)
Returns a Realm::Event that will be triggered after the given cuda_stream has completed its currently enqueued work.
std::vector< GPUInfo * > gpu_info
Definition cuda_module.h:237
virtual ~CudaModule(void)
std::map< GPU *, GPUWorker * > dedicated_workers
Definition cuda_module.h:236
bool get_cuda_device_uuid(Processor p, Uuid *uuid) const
static Module * create_module(RuntimeImpl *runtime)
Definition cuda_internal.h:1040
Definition cuda_internal.h:298
Definition cuda_internal.h:659
Definition cuda_internal.h:392
Definition event.h:50
Definition module_config.h:32
Definition module.h:42
Definition processor.h:37
static const Processor NO_PROC
Definition processor.h:54
Definition runtime_impl.h:264
Definition mutex.h:325
Definition mutex.h:223
Definition atomics.h:31
#define REALM_PUBLIC_API
Definition compiler_support.h:217
REALM_PUBLIC_API bool get_cuda_device_id(Processor p, int *device)
REALM_PUBLIC_API CUstream_st * get_task_cuda_stream()
REALM_PUBLIC_API void set_task_ctxsync_required(bool is_required)
char Uuid[UUID_SIZE]
Definition cuda_module.h:93
REALM_PUBLIC_API bool get_cuda_device_uuid(Processor p, Uuid *uuid)
void(* StreamAwareTaskFuncPtr)(const void *args, size_t arglen, const void *user_data, size_t user_data_len, Processor proc, CUstream_st *stream)
Definition cuda_module.h:87
unsigned MemoryType
Definition network.h:244
Definition activemsg.h:38
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
Definition cuda_module.h:155
CUfunc_st * fold_nonexcl
Definition cuda_module.h:161
CUfunc_st * apply_excl
Definition cuda_module.h:158
CUfunc_st * fold_excl
Definition cuda_module.h:160
Processor proc
Definition cuda_module.h:157
ReductionOpID redop_id
Definition cuda_module.h:156
CUfunc_st * apply_nonexcl
Definition cuda_module.h:159
Definition cuda_internal.h:127