Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
redop.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18// reduction ops for Realm
19
20#ifndef REALM_REDOP_H
21#define REALM_REDOP_H
22
23#include "realm/realm_config.h"
24
25#ifdef REALM_USE_CUDA
27#endif
28
29#ifdef REALM_USE_HIP
30#include "realm/hip/hip_redop.h"
31#endif
32
33#include <cstddef>
34#include <type_traits>
35
36namespace Realm {
37
38 // a reduction op needs to look like this
39#ifdef NOT_REALLY_CODE
40 class MyReductionOp {
41 public:
42 typedef int LHS;
43 typedef int RHS;
44
45 void apply(LHS &lhs, RHS rhs) const;
46
47 // both of these are optional
48 static const RHS identity;
49 void fold(RHS &rhs1, RHS rhs2) const;
50 };
51#endif
52
53 template <typename REDOP>
54 struct ReductionOp;
55
57 size_t sizeof_this; // includes any identity val or user data after struct
58 size_t sizeof_lhs;
59 size_t sizeof_rhs;
60 size_t sizeof_userdata; // extra data supplied to apply/fold
61 void *identity; // if non-null, points into same object
62 void *userdata; // if non-null, points into same object
63
64 // CPU apply/fold functions - tolerate strided src/dst
65 void (*cpu_apply_excl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr,
66 size_t rhs_stride, size_t count, const void *userdata);
67 void (*cpu_apply_nonexcl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr,
68 size_t rhs_stride, size_t count, const void *userdata);
69 void (*cpu_fold_excl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr,
70 size_t rhs2_stride, size_t count, const void *userdata);
71 void (*cpu_fold_nonexcl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr,
72 size_t rhs2_stride, size_t count, const void *userdata);
73
74#ifdef REALM_USE_CUDA
75 // CUDA kernels for apply/fold - these are not actually the functions,
76 // but just information (e.g. host wrapper fnptr) that can be used
77 // to look up the actual kernels
78 void *cuda_apply_excl_fn, *cuda_apply_nonexcl_fn;
79 void *cuda_fold_excl_fn, *cuda_fold_nonexcl_fn;
80 void *cuda_apply_excl_fn_advanced, *cuda_apply_nonexcl_fn_advanced;
81 void *cuda_fold_excl_fn_advanced, *cuda_fold_nonexcl_fn_advanced;
82 void *cuda_apply_excl_fn_transpose, *cuda_apply_nonexcl_fn_transpose;
83 void *cuda_fold_excl_fn_transpose, *cuda_fold_nonexcl_fn_transpose;
84 // These function pointers make the connection to the app's runtime
85 // instance in order to properly translate and capture the correct
86 // function to launch.
87 // The runtime function pointer to launch these shadow symbols
88 void *cudaLaunchKernel_fn;
89 // The runtime function pointer to translate the host shadow symbol to a driver
90 // function
91 void *cudaGetFuncBySymbol_fn;
92#endif
93#ifdef REALM_USE_HIP
94 // HIP kernels for apply/fold - these are not actually the functions,
95 // but just information (e.g. host wrapper fnptr) that can be used
96 // to look up the actual kernels
97 void *hip_apply_excl_fn, *hip_apply_nonexcl_fn;
98 void *hip_fold_excl_fn, *hip_fold_nonexcl_fn;
99#endif
100
103 , sizeof_lhs(0)
104 , sizeof_rhs(0)
105 , sizeof_userdata(0)
106 , identity(0)
107 , userdata(0)
112#ifdef REALM_USE_CUDA
113 , cuda_apply_excl_fn(0)
114 , cuda_apply_nonexcl_fn(0)
115 , cuda_fold_excl_fn(0)
116 , cuda_fold_nonexcl_fn(0)
117 , cuda_apply_excl_fn_advanced(0)
118 , cuda_apply_nonexcl_fn_advanced(0)
119 , cuda_fold_excl_fn_advanced(0)
120 , cuda_fold_nonexcl_fn_advanced(0)
121 , cuda_apply_excl_fn_transpose(0)
122 , cuda_apply_nonexcl_fn_transpose(0)
123 , cuda_fold_excl_fn_transpose(0)
124 , cuda_fold_nonexcl_fn_transpose(0)
125 , cudaLaunchKernel_fn(0)
126 , cudaGetFuncBySymbol_fn(0)
127#endif
128#ifdef REALM_USE_HIP
129 , hip_apply_excl_fn(0)
130 , hip_apply_nonexcl_fn(0)
131 , hip_fold_excl_fn(0)
132 , hip_fold_nonexcl_fn(0)
133#endif
134 {}
135
136 template <class REDOP>
138 {
139 // reduction ops are allowed to use helper constructors, but are
140 // type-erased inside of realm, so must be trivially copyable and
141 // trivially destructible (we will use malloc/memcpy/free instead
142 // of new/delete)
143 // FIXME:
144 // TODO:
145 // Re-enable when the examples used in legion CI are fixed.
146#if 0
147 static_assert(std::is_trivially_copyable<ReductionOp<REDOP>>::value &&
148 std::is_trivially_destructible<ReductionOp<REDOP>>::value,
149 "ReductionOp<REDOP> must be trivially copyable/destructible");
150#endif
151 void *ptr = malloc(sizeof(ReductionOp<REDOP>));
152 if(ptr) {
153 ReductionOpUntyped *redop = new(ptr) ReductionOp<REDOP>;
154 return redop;
155 } else
156 return nullptr;
157 }
158
160 };
161
162 namespace ReductionKernels {
163 template <typename REDOP, bool EXCL>
164 void cpu_apply_wrapper(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr,
165 size_t rhs_stride, size_t count, const void *userdata)
166 {
167 const REDOP *redop = static_cast<const REDOP *>(userdata);
168 for(size_t i = 0; i < count; i++) {
169 redop->template apply<EXCL>(*static_cast<typename REDOP::LHS *>(lhs_ptr),
170 *static_cast<const typename REDOP::RHS *>(rhs_ptr));
171 lhs_ptr = static_cast<char *>(lhs_ptr) + lhs_stride;
172 rhs_ptr = static_cast<const char *>(rhs_ptr) + rhs_stride;
173 }
174 }
175
176 template <typename REDOP, bool EXCL>
177 void cpu_fold_wrapper(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr,
178 size_t rhs2_stride, size_t count, const void *userdata)
179 {
180 const REDOP *redop = static_cast<const REDOP *>(userdata);
181 for(size_t i = 0; i < count; i++) {
182 redop->template fold<EXCL>(*static_cast<typename REDOP::RHS *>(rhs1_ptr),
183 *static_cast<const typename REDOP::RHS *>(rhs2_ptr));
184 rhs1_ptr = static_cast<char *>(rhs1_ptr) + rhs1_stride;
185 rhs2_ptr = static_cast<const char *>(rhs2_ptr) + rhs2_stride;
186 }
187 }
188 }; // namespace ReductionKernels
189
190#if defined(REALM_USE_CUDA) && defined(__CUDACC__)
191 // with a cuda-capable compiler, we'll automatically add cuda reduction
192 // kernels if the REDOP class defines has_cuda_reductions AND it's true
193 // this requires a bunch of SFINAE template-fu
194 template <typename T>
195 struct HasHasCudaReductions {
196 struct YES {
197 char dummy[1];
198 };
199 struct NO {
200 char dummy[2];
201 };
202 struct AltnerativeDefinition {
203 static const bool has_cuda_reductions = false;
204 };
205 template <typename T2>
206 struct Combined : public T2, public AltnerativeDefinition {};
207 template <typename T2, T2>
208 struct CheckAmbiguous {};
209 template <typename T2>
210 static NO
211 has_member(CheckAmbiguous<const bool *, &Combined<T2>::has_cuda_reductions> *);
212 template <typename T2>
213 static YES has_member(...);
214 const static bool value = sizeof(has_member<T>(0)) == sizeof(YES);
215 };
216
217 template <typename T, bool OK>
218 struct MaybeAddCudaReductions;
219 template <typename T>
220 struct MaybeAddCudaReductions<T, false> {
221 static void if_member_exists(ReductionOpUntyped *redop){};
222 static void if_member_is_true(ReductionOpUntyped *redop){};
223 };
224 template <typename T>
225 struct MaybeAddCudaReductions<T, true> {
226 static void if_member_exists(ReductionOpUntyped *redop)
227 {
228 MaybeAddCudaReductions<T, T::has_cuda_reductions>::if_member_is_true(redop);
229 }
230 static void if_member_is_true(ReductionOpUntyped *redop)
231 {
232 Cuda::add_cuda_redop_kernels<T>(redop);
233 }
234 };
235#endif
236
237#if defined(REALM_USE_HIP) && (defined(__CUDACC__) || defined(__HIPCC__))
238 // with a hip-capable compiler, we'll automatically add hip reduction
239 // kernels if the REDOP class defines has_hip_reductions AND it's true
240 // this requires a bunch of SFINAE template-fu
241 template <typename T>
242 struct HasHasHipReductions {
243 struct YES {
244 char dummy[1];
245 };
246 struct NO {
247 char dummy[2];
248 };
249 struct AltnerativeDefinition {
250 static const bool has_hip_reductions = false;
251 };
252 template <typename T2>
253 struct Combined : public T2, public AltnerativeDefinition {};
254 template <typename T2, T2>
255 struct CheckAmbiguous {};
256 template <typename T2>
257 static NO
258 has_member(CheckAmbiguous<const bool *, &Combined<T2>::has_hip_reductions> *);
259 template <typename T2>
260 static YES has_member(...);
261 const static bool value = sizeof(has_member<T>(0)) == sizeof(YES);
262 };
263
264 template <typename T, bool OK>
265 struct MaybeAddHipReductions;
266 template <typename T>
267 struct MaybeAddHipReductions<T, false> {
268 static void if_member_exists(ReductionOpUntyped *redop){};
269 static void if_member_is_true(ReductionOpUntyped *redop){};
270 };
271 template <typename T>
272 struct MaybeAddHipReductions<T, true> {
273 static void if_member_exists(ReductionOpUntyped *redop)
274 {
275 MaybeAddHipReductions<T, T::has_hip_reductions>::if_member_is_true(redop);
276 }
277 static void if_member_is_true(ReductionOpUntyped *redop)
278 {
279 Hip::add_hip_redop_kernels<T>(redop);
280 }
281 };
282#endif
283
284 template <typename REDOP>
286 // tacked on to end of ReductionOpUntyped struct
287 typename REDOP::RHS identity_val;
289
291 : identity_val(REDOP::identity)
292 , userdata_val()
293 {
295 sizeof_lhs = sizeof(typename REDOP::LHS);
296 sizeof_rhs = sizeof(typename REDOP::RHS);
297 sizeof_userdata = sizeof(REDOP);
300 cpu_apply_excl_fn = &ReductionKernels::cpu_apply_wrapper<REDOP, true>;
301 cpu_apply_nonexcl_fn = &ReductionKernels::cpu_apply_wrapper<REDOP, false>;
302 cpu_fold_excl_fn = &ReductionKernels::cpu_fold_wrapper<REDOP, true>;
303 cpu_fold_nonexcl_fn = &ReductionKernels::cpu_fold_wrapper<REDOP, false>;
304#if defined(REALM_USE_CUDA) && defined(__CUDACC__)
305 // if REDOP defines/sets 'has_cuda_reductions' to true, try to
306 // automatically build wrappers for apply_cuda<> and fold_cuda<>
307 MaybeAddCudaReductions<REDOP, HasHasCudaReductions<REDOP>::value>::if_member_exists(
308 this);
309#endif
310#if defined(REALM_USE_HIP) && (defined(__CUDACC__) || defined(__HIPCC__))
311 // if REDOP defines/sets 'has_hip_reductions' to true, try to
312 // automatically build wrappers for apply_hip<> and fold_hip<>
313 MaybeAddHipReductions<REDOP, HasHasHipReductions<REDOP>::value>::if_member_exists(
314 this);
315#endif
316 }
317
318 protected:
319 };
320
321}; // namespace Realm
322
323 // include "redop.inl"
324
325#endif // ifndef REALM_REDOP_H
Realm::ReductionOp< REDOP > ReductionOp
Definition prealm.h:88
void cpu_apply_wrapper(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr, size_t rhs_stride, size_t count, const void *userdata)
Definition redop.h:164
void cpu_fold_wrapper(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr, size_t rhs2_stride, size_t count, const void *userdata)
Definition redop.h:177
Definition activemsg.h:38
Definition redop.h:56
size_t sizeof_lhs
Definition redop.h:58
static ReductionOpUntyped * create_reduction_op(void)
Definition redop.h:137
void(* cpu_apply_nonexcl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr, size_t rhs_stride, size_t count, const void *userdata)
Definition redop.h:67
size_t sizeof_this
Definition redop.h:57
void(* cpu_fold_nonexcl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr, size_t rhs2_stride, size_t count, const void *userdata)
Definition redop.h:71
ReductionOpUntyped()
Definition redop.h:101
static ReductionOpUntyped * clone_reduction_op(const ReductionOpUntyped *redop)
void * identity
Definition redop.h:61
void * userdata
Definition redop.h:62
size_t sizeof_rhs
Definition redop.h:59
size_t sizeof_userdata
Definition redop.h:60
void(* cpu_fold_excl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr, size_t rhs2_stride, size_t count, const void *userdata)
Definition redop.h:69
void(* cpu_apply_excl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr, size_t rhs_stride, size_t count, const void *userdata)
Definition redop.h:65
Definition redop.h:285
ReductionOp()
Definition redop.h:290
REDOP userdata_val
Definition redop.h:288
REDOP::RHS identity_val
Definition redop.h:287