Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
redop.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18// reduction ops for Realm
19
20#ifndef REALM_REDOP_H
21#define REALM_REDOP_H
22
23#include "realm/realm_config.h"
24
25#ifdef REALM_USE_CUDA
27#endif
28
29#ifdef REALM_USE_HIP
30#include "realm/hip/hip_redop.h"
31#endif
32
33#include <cstddef>
34#include <type_traits>
35
36namespace Realm {
37
38 // a reduction op needs to look like this
39#ifdef NOT_REALLY_CODE
40 class MyReductionOp {
41 public:
42 typedef int LHS;
43 typedef int RHS;
44
45 void apply(LHS &lhs, RHS rhs) const;
46
47 // both of these are optional
48 static const RHS identity;
49 void fold(RHS &rhs1, RHS rhs2) const;
50 };
51#endif
52
53 template <typename REDOP>
54 struct ReductionOp;
55
57 size_t sizeof_this; // includes any identity val or user data after struct
58 size_t sizeof_lhs;
59 size_t sizeof_rhs;
60 size_t sizeof_userdata; // extra data supplied to apply/fold
61 void *identity; // if non-null, points into same object
62 void *userdata; // if non-null, points into same object
63
64 // CPU apply/fold functions - tolerate strided src/dst
65 void (*cpu_apply_excl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr,
66 size_t rhs_stride, size_t count, const void *userdata);
67 void (*cpu_apply_nonexcl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr,
68 size_t rhs_stride, size_t count, const void *userdata);
69 void (*cpu_fold_excl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr,
70 size_t rhs2_stride, size_t count, const void *userdata);
71 void (*cpu_fold_nonexcl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr,
72 size_t rhs2_stride, size_t count, const void *userdata);
73
74#ifdef REALM_USE_CUDA
75 // CUDA kernels for apply/fold - these are not actually the functions,
76 // but just information (e.g. host wrapper fnptr) that can be used
77 // to look up the actual kernels
78 void *cuda_apply_excl_fn, *cuda_apply_nonexcl_fn;
79 void *cuda_fold_excl_fn, *cuda_fold_nonexcl_fn;
80
81 // These function pointers make the connection to the app's runtime
82 // instance in order to properly translate and capture the correct
83 // function to launch.
84 // The runtime function pointer to launch these shadow symbols
85 void *cudaLaunchKernel_fn;
86 // The runtime function pointer to translate the host shadow symbol to a driver
87 // function
88 void *cudaGetFuncBySymbol_fn;
89#endif
90#ifdef REALM_USE_HIP
91 // HIP kernels for apply/fold - these are not actually the functions,
92 // but just information (e.g. host wrapper fnptr) that can be used
93 // to look up the actual kernels
94 void *hip_apply_excl_fn, *hip_apply_nonexcl_fn;
95 void *hip_fold_excl_fn, *hip_fold_nonexcl_fn;
96#endif
97
100 , sizeof_lhs(0)
101 , sizeof_rhs(0)
102 , sizeof_userdata(0)
103 , identity(0)
104 , userdata(0)
109#ifdef REALM_USE_CUDA
110 , cuda_apply_excl_fn(0)
111 , cuda_apply_nonexcl_fn(0)
112 , cuda_fold_excl_fn(0)
113 , cuda_fold_nonexcl_fn(0)
114 , cudaLaunchKernel_fn(0)
115 , cudaGetFuncBySymbol_fn(0)
116#endif
117#ifdef REALM_USE_HIP
118 , hip_apply_excl_fn(0)
119 , hip_apply_nonexcl_fn(0)
120 , hip_fold_excl_fn(0)
121 , hip_fold_nonexcl_fn(0)
122#endif
123 {}
124
125 template <class REDOP>
127 {
128 // reduction ops are allowed to use helper constructors, but are
129 // type-erased inside of realm, so must be trivially copyable and
130 // trivially destructible (we will use malloc/memcpy/free instead
131 // of new/delete)
132 // FIXME:
133 // TODO:
134 // Re-enable when the examples used in legion CI are fixed.
135#if 0
136 static_assert(std::is_trivially_copyable<ReductionOp<REDOP>>::value &&
137 std::is_trivially_destructible<ReductionOp<REDOP>>::value,
138 "ReductionOp<REDOP> must be trivially copyable/destructible");
139#endif
140 void *ptr = malloc(sizeof(ReductionOp<REDOP>));
141 if(ptr) {
142 ReductionOpUntyped *redop = new(ptr) ReductionOp<REDOP>;
143 return redop;
144 } else
145 return nullptr;
146 }
147
149 };
150
151 namespace ReductionKernels {
152 template <typename REDOP, bool EXCL>
153 void cpu_apply_wrapper(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr,
154 size_t rhs_stride, size_t count, const void *userdata)
155 {
156 const REDOP *redop = static_cast<const REDOP *>(userdata);
157 for(size_t i = 0; i < count; i++) {
158 redop->template apply<EXCL>(*static_cast<typename REDOP::LHS *>(lhs_ptr),
159 *static_cast<const typename REDOP::RHS *>(rhs_ptr));
160 lhs_ptr = static_cast<char *>(lhs_ptr) + lhs_stride;
161 rhs_ptr = static_cast<const char *>(rhs_ptr) + rhs_stride;
162 }
163 }
164
165 template <typename REDOP, bool EXCL>
166 void cpu_fold_wrapper(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr,
167 size_t rhs2_stride, size_t count, const void *userdata)
168 {
169 const REDOP *redop = static_cast<const REDOP *>(userdata);
170 for(size_t i = 0; i < count; i++) {
171 redop->template fold<EXCL>(*static_cast<typename REDOP::RHS *>(rhs1_ptr),
172 *static_cast<const typename REDOP::RHS *>(rhs2_ptr));
173 rhs1_ptr = static_cast<char *>(rhs1_ptr) + rhs1_stride;
174 rhs2_ptr = static_cast<const char *>(rhs2_ptr) + rhs2_stride;
175 }
176 }
177 }; // namespace ReductionKernels
178
179#if defined(REALM_USE_CUDA) && defined(__CUDACC__)
180 // with a cuda-capable compiler, we'll automatically add cuda reduction
181 // kernels if the REDOP class defines has_cuda_reductions AND it's true
182 // this requires a bunch of SFINAE template-fu
183 template <typename T>
184 struct HasHasCudaReductions {
185 struct YES {
186 char dummy[1];
187 };
188 struct NO {
189 char dummy[2];
190 };
191 struct AltnerativeDefinition {
192 static const bool has_cuda_reductions = false;
193 };
194 template <typename T2>
195 struct Combined : public T2, public AltnerativeDefinition {};
196 template <typename T2, T2>
197 struct CheckAmbiguous {};
198 template <typename T2>
199 static NO
200 has_member(CheckAmbiguous<const bool *, &Combined<T2>::has_cuda_reductions> *);
201 template <typename T2>
202 static YES has_member(...);
203 const static bool value = sizeof(has_member<T>(0)) == sizeof(YES);
204 };
205
206 template <typename T, bool OK>
207 struct MaybeAddCudaReductions;
208 template <typename T>
209 struct MaybeAddCudaReductions<T, false> {
210 static void if_member_exists(ReductionOpUntyped *redop){};
211 static void if_member_is_true(ReductionOpUntyped *redop){};
212 };
213 template <typename T>
214 struct MaybeAddCudaReductions<T, true> {
215 static void if_member_exists(ReductionOpUntyped *redop)
216 {
217 MaybeAddCudaReductions<T, T::has_cuda_reductions>::if_member_is_true(redop);
218 }
219 static void if_member_is_true(ReductionOpUntyped *redop)
220 {
221 Cuda::add_cuda_redop_kernels<T>(redop);
222 }
223 };
224#endif
225
226#if defined(REALM_USE_HIP) && (defined(__CUDACC__) || defined(__HIPCC__))
227 // with a hip-capable compiler, we'll automatically add hip reduction
228 // kernels if the REDOP class defines has_hip_reductions AND it's true
229 // this requires a bunch of SFINAE template-fu
230 template <typename T>
231 struct HasHasHipReductions {
232 struct YES {
233 char dummy[1];
234 };
235 struct NO {
236 char dummy[2];
237 };
238 struct AltnerativeDefinition {
239 static const bool has_hip_reductions = false;
240 };
241 template <typename T2>
242 struct Combined : public T2, public AltnerativeDefinition {};
243 template <typename T2, T2>
244 struct CheckAmbiguous {};
245 template <typename T2>
246 static NO
247 has_member(CheckAmbiguous<const bool *, &Combined<T2>::has_hip_reductions> *);
248 template <typename T2>
249 static YES has_member(...);
250 const static bool value = sizeof(has_member<T>(0)) == sizeof(YES);
251 };
252
253 template <typename T, bool OK>
254 struct MaybeAddHipReductions;
255 template <typename T>
256 struct MaybeAddHipReductions<T, false> {
257 static void if_member_exists(ReductionOpUntyped *redop){};
258 static void if_member_is_true(ReductionOpUntyped *redop){};
259 };
260 template <typename T>
261 struct MaybeAddHipReductions<T, true> {
262 static void if_member_exists(ReductionOpUntyped *redop)
263 {
264 MaybeAddHipReductions<T, T::has_hip_reductions>::if_member_is_true(redop);
265 }
266 static void if_member_is_true(ReductionOpUntyped *redop)
267 {
268 Hip::add_hip_redop_kernels<T>(redop);
269 }
270 };
271#endif
272
273 template <typename REDOP>
275 // tacked on to end of ReductionOpUntyped struct
276 typename REDOP::RHS identity_val;
278
280 : identity_val(REDOP::identity)
281 , userdata_val()
282 {
284 sizeof_lhs = sizeof(typename REDOP::LHS);
285 sizeof_rhs = sizeof(typename REDOP::RHS);
286 sizeof_userdata = sizeof(REDOP);
289 cpu_apply_excl_fn = &ReductionKernels::cpu_apply_wrapper<REDOP, true>;
290 cpu_apply_nonexcl_fn = &ReductionKernels::cpu_apply_wrapper<REDOP, false>;
291 cpu_fold_excl_fn = &ReductionKernels::cpu_fold_wrapper<REDOP, true>;
292 cpu_fold_nonexcl_fn = &ReductionKernels::cpu_fold_wrapper<REDOP, false>;
293#if defined(REALM_USE_CUDA) && defined(__CUDACC__)
294 // if REDOP defines/sets 'has_cuda_reductions' to true, try to
295 // automatically build wrappers for apply_cuda<> and fold_cuda<>
296 MaybeAddCudaReductions<REDOP, HasHasCudaReductions<REDOP>::value>::if_member_exists(
297 this);
298#endif
299#if defined(REALM_USE_HIP) && (defined(__CUDACC__) || defined(__HIPCC__))
300 // if REDOP defines/sets 'has_hip_reductions' to true, try to
301 // automatically build wrappers for apply_hip<> and fold_hip<>
302 MaybeAddHipReductions<REDOP, HasHasHipReductions<REDOP>::value>::if_member_exists(
303 this);
304#endif
305 }
306
307 protected:
308 };
309
310}; // namespace Realm
311
312 // include "redop.inl"
313
314#endif // ifndef REALM_REDOP_H
Realm::ReductionOp< REDOP > ReductionOp
Definition prealm.h:88
void cpu_apply_wrapper(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr, size_t rhs_stride, size_t count, const void *userdata)
Definition redop.h:153
void cpu_fold_wrapper(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr, size_t rhs2_stride, size_t count, const void *userdata)
Definition redop.h:166
Definition activemsg.h:38
Definition redop.h:56
size_t sizeof_lhs
Definition redop.h:58
static ReductionOpUntyped * create_reduction_op(void)
Definition redop.h:126
void(* cpu_apply_nonexcl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr, size_t rhs_stride, size_t count, const void *userdata)
Definition redop.h:67
size_t sizeof_this
Definition redop.h:57
void(* cpu_fold_nonexcl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr, size_t rhs2_stride, size_t count, const void *userdata)
Definition redop.h:71
ReductionOpUntyped()
Definition redop.h:98
static ReductionOpUntyped * clone_reduction_op(const ReductionOpUntyped *redop)
void * identity
Definition redop.h:61
void * userdata
Definition redop.h:62
size_t sizeof_rhs
Definition redop.h:59
size_t sizeof_userdata
Definition redop.h:60
void(* cpu_fold_excl_fn)(void *rhs1_ptr, size_t rhs1_stride, const void *rhs2_ptr, size_t rhs2_stride, size_t count, const void *userdata)
Definition redop.h:69
void(* cpu_apply_excl_fn)(void *lhs_ptr, size_t lhs_stride, const void *rhs_ptr, size_t rhs_stride, size_t count, const void *userdata)
Definition redop.h:65
Definition redop.h:274
ReductionOp()
Definition redop.h:279
REDOP userdata_val
Definition redop.h:277
REDOP::RHS identity_val
Definition redop.h:276