Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
hip_module.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation, Los Alamos National Laboratory
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_HIP_H
19#define REALM_HIP_H
20
21#include "realm/realm_config.h"
22#include "realm/module.h"
23#include "realm/processor.h"
24#include "realm/network.h"
25#include "realm/atomics.h"
26
27// realm/hip_module.h is designed to be include-able even when the system
28// doesn't actually have HIP installed, so we need to declare types that
29// are compatible with the HIP runtime APIs - we can't "extern"
30// a typedef (e.g. hipStream_t) but we can forward declare the underlying
31// struct that those types are pointers to
32#ifdef __HIP_PLATFORM_NVIDIA__
33struct CUstream_st; // cudaStream_t == CUstream_st *
34typedef CUstream_st unifiedHipStream_t; // names the pointee struct: stream handles are spelled `unifiedHipStream_t *`
35#else
36struct ihipStream_t; // hipStream_t == ihipStream_t *
37typedef ihipStream_t unifiedHipStream_t; // names the pointee struct: stream handles are spelled `unifiedHipStream_t *`
38#endif
39
40namespace Realm {
41
42 namespace NetworkSegmentInfo {
43 // HIP device memory - extra is a uintptr_t'd pointer to the GPU
44 // object
45 static const MemoryType HipDeviceMem = 3;
46
47 // HIP managed memory - extra is a uintptr_t'd pointer to _one of_
48 // the GPU objects
49 static const MemoryType HipManagedMem = 4;
50 }; // namespace NetworkSegmentInfo
51
52 namespace Hip {
53
54 // a running task on a HIP processor is assigned a stream by Realm, and
55 // any work placed on this stream is automatically captured by the
56 // completion event for the task
57 // when using the HIP runtime hijack, Realm will force work launched via
58 // the runtime API to use the task's stream, but without hijack, or for
59 // code that uses the HIP runtime API, the task must explicitly request
60 // the stream that is associated with the task and place work on it to
61 // avoid more expensive forms of completion detection for the task
62 // NOTE: `get_task_hip_stream` will return a null pointer if called outside of a
63 // task running on a HIP processor
65
66 // when Realm is not using the HIP runtime hijack to force work onto the
67 // task's stream, it conservatively uses a full context synchronization to
68 // make sure all device work launched by the task is captured by the task
69 // completion event - if a task uses `get_task_hip_stream` and places all
70 // work on that stream, `set_task_ctxsync_required` can be used to tell Realm
71 // on a per-task basis that full context synchronization is not required
73
74 // rather than using the APIs above, HIP processors also support task
75 // implementations that are natively stream aware - if a task function uses
76 // the `Hip::StreamAwareTaskFuncPtr` prototype below (instead of the normal
77 // `Processor::TaskFuncPtr`), the following differences apply:
78 // a) it need not call `get_task_hip_stream` because it gets the same value
79 // directly as an argument
80 // b) by default, a context synchronization will NOT be performed as part of
81 // task completion detection (this can still be overridden with a call to
82 // `set_task_ctxsync_required(true)` if a task puts work outside the
83 // specified stream for some reason)
84 // c) if a stream-aware task has preconditions that involve device work, that
85 // work will be tied into the task's stream, but the task body may start
86 // executing BEFORE that work is complete (i.e. for correctness, all work
87 // launched by the task must be properly ordered (using the HIP APIs)
88 // after anything already in the stream assigned to the task)
89 typedef void (*StreamAwareTaskFuncPtr)(const void *args, size_t arglen,
90 const void *user_data, size_t user_data_len,
91 Processor proc, unifiedHipStream_t *stream); // stream: the same value `get_task_hip_stream` would return for this task
92
93 class GPU;
94 class GPUWorker;
95 struct GPUInfo;
96 class GPUZCMemory;
98
100 friend class HipModule;
101
102 protected:
104
106
107 public:
108 virtual void configure_from_cmdline(std::vector<std::string> &cmdline);
109
110 public:
111 // configurations
112 size_t cfg_zc_mem_size = 64 << 20, cfg_zc_ib_size = 256 << 20;
113 size_t cfg_fb_mem_size = 256 << 20, cfg_fb_ib_size = 128 << 20;
116 size_t cfg_dynfb_max_size = ~size_t(0);
118 std::string cfg_gpu_idxs;
124 unsigned cfg_skip_gpu_count = 0;
125 bool cfg_skip_busy_gpus = false;
127 int cfg_task_context_sync = -1; // 0 = no, 1 = yes, -1 = default (based on hijack)
130 size_t cfg_hostreg_limit = 1 << 30;
132 bool cfg_use_hip_ipc = true;
133
134 // resources
138 std::vector<size_t> res_fbmem_sizes;
139 };
140
141 // our interface to the rest of the runtime
143 protected:
145
146 public:
147 virtual ~HipModule(void);
148
150
152
153 // do any general initialization - this is called after all configuration is
154 // complete
155 virtual void initialize(RuntimeImpl *runtime);
156
157 // create any memories provided by this module (default == do nothing)
158 // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id)
159 virtual void create_memories(RuntimeImpl *runtime);
160
161 // create any processors provided by the module (default == do nothing)
162 // (each new ProcessorImpl should use a Processor from
163 // RuntimeImpl::next_local_processor_id)
164 virtual void create_processors(RuntimeImpl *runtime);
165
166 // create any DMA channels provided by the module (default == do nothing)
167 virtual void create_dma_channels(RuntimeImpl *runtime);
168
169 // create any code translators provided by the module (default == do nothing)
170 virtual void create_code_translators(RuntimeImpl *runtime);
171
172 // if a module has to do cleanup that involves sending messages to other
173 // nodes, this must be done in the pre-detach cleanup
174 virtual void pre_detach_cleanup(void);
175
176 // clean up any common resources created by the module - this will be called
177 // after all memories/processors/etc. have been shut down and destroyed
178 virtual void cleanup(void);
179
180 // free functions above are normally used, but these can be used directly
181 // if you already have a pointer to the HipModule
183 void set_task_ctxsync_required(bool is_required);
184
185 public:
188
189 // "global" variables live here too
191 std::map<GPU *, GPUWorker *> dedicated_workers;
192 std::vector<GPUInfo *> gpu_info;
193 std::vector<GPU *> gpus;
194 void *zcmem_cpu_base, *zcib_cpu_base;
196 void *uvm_base; // guaranteed to be same for CPU and GPU
198 std::vector<void *> registered_host_ptrs;
200
206 };
207
208 }; // namespace Hip
209
210}; // namespace Realm
211
212#include "realm/hip/hip_module.inl"
213
214#endif
Definition hip_internal.h:869
Definition hip_internal.h:227
Definition hip_internal.h:612
Definition hip_internal.h:322
Definition hip_module.h:99
virtual void configure_from_cmdline(std::vector< std::string > &cmdline)
int cfg_max_ctxsync_threads
Definition hip_module.h:128
bool cfg_pin_sysmem
Definition hip_module.h:121
size_t cfg_zc_mem_size
Definition hip_module.h:112
size_t res_min_fbmem_size
Definition hip_module.h:137
size_t cfg_uvm_mem_size
Definition hip_module.h:114
int cfg_d2d_stream_priority
Definition hip_module.h:131
bool cfg_use_dynamic_fb
Definition hip_module.h:115
size_t cfg_fb_ib_size
Definition hip_module.h:113
size_t cfg_hostreg_limit
Definition hip_module.h:130
size_t cfg_min_avail_mem
Definition hip_module.h:126
bool cfg_fences_use_callbacks
Definition hip_module.h:122
bool cfg_use_worker_threads
Definition hip_module.h:120
int cfg_task_context_sync
Definition hip_module.h:127
std::string cfg_gpu_idxs
Definition hip_module.h:118
bool cfg_multithread_dma
Definition hip_module.h:129
size_t cfg_dynfb_max_size
Definition hip_module.h:116
int res_num_gpus
Definition hip_module.h:136
size_t cfg_fb_mem_size
Definition hip_module.h:113
bool cfg_suppress_hijack_warning
Definition hip_module.h:123
bool cfg_use_hip_ipc
Definition hip_module.h:132
std::vector< size_t > res_fbmem_sizes
Definition hip_module.h:138
unsigned cfg_task_streams
Definition hip_module.h:119
unsigned cfg_d2d_streams
Definition hip_module.h:119
int cfg_num_gpus
Definition hip_module.h:117
size_t cfg_zc_ib_size
Definition hip_module.h:112
bool cfg_use_shared_worker
Definition hip_module.h:120
bool resource_discovered
Definition hip_module.h:135
bool cfg_skip_busy_gpus
Definition hip_module.h:125
unsigned cfg_skip_gpu_count
Definition hip_module.h:124
Definition hip_module.h:142
virtual void cleanup(void)
static Module * create_module(RuntimeImpl *runtime)
virtual void create_memories(RuntimeImpl *runtime)
GPUWorker * shared_worker
Definition hip_module.h:190
GPUReplHeapListener * rh_listener
Definition hip_module.h:199
virtual void create_processors(RuntimeImpl *runtime)
GPUZCMemory * zcmem
Definition hip_module.h:195
virtual ~HipModule(void)
std::map< GPU *, GPUWorker * > dedicated_workers
Definition hip_module.h:191
RuntimeImpl * runtime
Definition hip_module.h:187
unifiedHipStream_t * get_task_hip_stream()
virtual void pre_detach_cleanup(void)
virtual void initialize(RuntimeImpl *runtime)
Mutex::CondVar hipipc_condvar
Definition hip_module.h:202
atomic< int > hipipc_exports_remaining
Definition hip_module.h:205
atomic< int > hipipc_responses_needed
Definition hip_module.h:203
GPUZCMemory * uvmmem
Definition hip_module.h:197
void * zcib_cpu_base
Definition hip_module.h:194
HipModuleConfig * config
Definition hip_module.h:186
Mutex hipipc_mutex
Definition hip_module.h:201
virtual void create_code_translators(RuntimeImpl *runtime)
void set_task_ctxsync_required(bool is_required)
std::vector< void * > registered_host_ptrs
Definition hip_module.h:198
static ModuleConfig * create_module_config(RuntimeImpl *runtime)
virtual void create_dma_channels(RuntimeImpl *runtime)
HipModule(RuntimeImpl *_runtime)
std::vector< GPU * > gpus
Definition hip_module.h:193
std::vector< GPUInfo * > gpu_info
Definition hip_module.h:192
atomic< int > hipipc_releases_needed
Definition hip_module.h:204
void * uvm_base
Definition hip_module.h:196
Definition module_config.h:32
Definition module.h:42
Definition processor.h:37
Definition runtime_impl.h:264
Definition mutex.h:325
Definition mutex.h:223
Definition atomics.h:31
#define REALM_PUBLIC_API
Definition compiler_support.h:217
ihipStream_t unifiedHipStream_t
Definition hip_module.h:37
REALM_PUBLIC_API void set_task_ctxsync_required(bool is_required)
void(* StreamAwareTaskFuncPtr)(const void *args, size_t arglen, const void *user_data, size_t user_data_len, Processor proc, unifiedHipStream_t *stream)
Definition hip_module.h:89
REALM_PUBLIC_API unifiedHipStream_t * get_task_hip_stream()
unsigned MemoryType
Definition network.h:244
Definition activemsg.h:38
Definition hip_internal.h:69