Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
cuda_redop.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CUDA_REDOP_H
19#define REALM_CUDA_REDOP_H
20
21#include "realm/realm_config.h"
22#include "cuda_reduc.h"
23#include <stddef.h>
24#include <stdint.h>
25
26namespace Realm {
27
28 namespace Cuda {
29
30#ifdef __CUDACC__
31 template <typename Offset_t = size_t>
32 static __device__ inline void index_to_coords(Offset_t *coords, Offset_t index,
33 const Offset_t *extents,
34 const size_t elem_size)
35 {
36 size_t div = index;
37 const unsigned n = 3;
38#pragma unroll
39 for(int i = 0; i < n - 1; i++) {
40 size_t div_tmp = div / extents[i];
41 coords[i] = div - div_tmp * extents[i];
42 div = div_tmp;
43 }
44 coords[n - 1] = div;
45 coords[0] = coords[0] * elem_size;
46 }
47
48 template <typename Offset_t = size_t>
49 static __device__ inline size_t coords_to_index(const Offset_t *coords,
50 const Offset_t *strides,
51 const size_t elem_size)
52 {
53 size_t i = 0;
54 size_t vol = 1;
55 int d = 0;
56 const unsigned n = 3;
57#pragma unroll
58 for(; d < n - 1; d++) {
59 i += vol * coords[d];
60 vol *= strides[d];
61 }
62
63 i += vol * coords[d];
64 i = i / elem_size;
65 return i;
66 }
67
68 template <typename Offset_t = size_t>
69 static __device__ inline size_t coords_to_index_transpose(const Offset_t *coords,
70 const Offset_t *strides)
71 {
72 size_t i = 0;
73 i = coords[1] * strides[0] + coords[2] * strides[1] + coords[0];
74 return i;
75 }
76
77 namespace ReductionKernelsAdvanced {
78 template <typename REDOP, bool EXCL>
79 __global__ void fold_cuda_kernel(Realm::Cuda::AffineReducInfo<3> info, REDOP redop)
80 {
81 size_t off = blockIdx.x * blockDim.x + threadIdx.x;
82 Realm::Cuda::AffineReducPair<3> &current_info = info.subrects[0];
83 size_t vol = current_info.volume;
84 size_t num_elems_rhs = current_info.src.elem_size / sizeof(typename REDOP::RHS);
85 size_t redop_rhs_size = sizeof(typename REDOP::RHS);
86 typename REDOP::RHS *dst =
87 reinterpret_cast<typename REDOP::RHS *>(current_info.dst.addr);
88 typename REDOP::RHS *src =
89 reinterpret_cast<typename REDOP::RHS *>(current_info.src.addr);
90 for(size_t idx = off; idx < vol; idx += blockDim.x * gridDim.x) {
91 size_t coords[3];
92 index_to_coords<size_t>(coords, idx, current_info.extents, redop_rhs_size);
93 const size_t src_idx =
94 coords_to_index<size_t>(coords, current_info.src.strides, redop_rhs_size);
95 const size_t dst_idx =
96 coords_to_index<size_t>(coords, current_info.dst.strides, redop_rhs_size);
97 redop.template fold_cuda<EXCL>(
98 *reinterpret_cast<typename REDOP::RHS *>(&dst[dst_idx * num_elems_rhs]),
99 *reinterpret_cast<const typename REDOP::RHS *>(
100 &src[src_idx * num_elems_rhs]));
101 }
102 }
103
104 template <typename REDOP, bool EXCL>
105 __global__ void apply_cuda_kernel(Realm::Cuda::AffineReducInfo<3> info, REDOP redop)
106 {
107 size_t off = blockIdx.x * blockDim.x + threadIdx.x;
108 Realm::Cuda::AffineReducPair<3> &current_info = info.subrects[0];
109 size_t vol = current_info.volume;
110 size_t num_elems_lhs = current_info.dst.elem_size / sizeof(typename REDOP::LHS);
111 size_t num_elems_rhs = current_info.src.elem_size / sizeof(typename REDOP::RHS);
112 size_t redop_lhs_size = sizeof(typename REDOP::LHS);
113 size_t redop_rhs_size = sizeof(typename REDOP::RHS);
114 typename REDOP::LHS *dst =
115 reinterpret_cast<typename REDOP::LHS *>(current_info.dst.addr);
116 typename REDOP::RHS *src =
117 reinterpret_cast<typename REDOP::RHS *>(current_info.src.addr);
118 for(size_t idx = off; idx < vol; idx += blockDim.x * gridDim.x) {
119 size_t coords[3];
120 index_to_coords<size_t>(coords, idx, current_info.extents, redop_rhs_size);
121 const size_t src_idx =
122 coords_to_index<size_t>(coords, current_info.src.strides, redop_rhs_size);
123 const size_t dst_idx =
124 coords_to_index<size_t>(coords, current_info.dst.strides, redop_lhs_size);
125 redop.template apply_cuda<EXCL>(
126 *reinterpret_cast<typename REDOP::LHS *>(&(dst[dst_idx * num_elems_lhs])),
127 *reinterpret_cast<const typename REDOP::RHS *>(
128 &src[src_idx * num_elems_rhs]));
129 }
130 }
131 }; // namespace ReductionKernelsAdvanced
132
133 namespace ReductionKernelsTranspose {
134 template <typename REDOP, bool EXCL>
135 __global__ void fold_cuda_kernel(Realm::Cuda::MemReducInfo<size_t> current_info,
136 REDOP redop)
137 {
138 size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
139 size_t vol = current_info.volume;
140 size_t num_elems = current_info.elem_size / sizeof(typename REDOP::RHS);
141 typename REDOP::RHS *dst =
142 reinterpret_cast<typename REDOP::RHS *>(current_info.dst);
143 typename REDOP::RHS *src =
144 reinterpret_cast<typename REDOP::RHS *>(current_info.src);
145 for(size_t idx = offset; idx < vol; idx += blockDim.x * gridDim.x) {
146 size_t coords[3];
147 index_to_coords<size_t>(coords, idx, current_info.extents, 1);
148 const size_t src_idx =
149 coords_to_index_transpose<size_t>(coords, current_info.src_strides);
150 const size_t dst_idx =
151 coords_to_index_transpose<size_t>(coords, current_info.dst_strides);
152 redop.template fold_cuda<EXCL>(
153 *reinterpret_cast<typename REDOP::RHS *>(&dst[dst_idx * num_elems]),
154 *reinterpret_cast<const typename REDOP::RHS *>(&src[src_idx * num_elems]));
155 }
156 }
157
158 template <typename REDOP, bool EXCL>
159 __global__ void apply_cuda_kernel(Realm::Cuda::MemReducInfo<size_t> current_info,
160 REDOP redop)
161 {
162 const size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
163 size_t vol = current_info.volume;
164 size_t num_elems = current_info.elem_size / sizeof(typename REDOP::RHS);
165 typename REDOP::LHS *dst =
166 reinterpret_cast<typename REDOP::LHS *>(current_info.dst);
167 typename REDOP::RHS *src =
168 reinterpret_cast<typename REDOP::RHS *>(current_info.src);
169
170 for(size_t idx = offset; idx < vol; idx += blockDim.x * gridDim.x) {
171 size_t coords[3];
172 index_to_coords<size_t>(coords, idx, current_info.extents, 1);
173 const size_t src_idx =
174 coords_to_index_transpose<size_t>(coords, current_info.src_strides);
175 const size_t dst_idx =
176 coords_to_index_transpose<size_t>(coords, current_info.dst_strides);
177 redop.template apply_cuda<EXCL>(
178 *reinterpret_cast<typename REDOP::LHS *>(&dst[dst_idx * num_elems]),
179 *reinterpret_cast<const typename REDOP::RHS *>(&src[src_idx * num_elems]));
180 }
181 }
182 }; // namespace ReductionKernelsTranspose
183
184 // the ability to add CUDA kernels to a reduction op is only available
185 // when using a compiler that understands CUDA
186 namespace ReductionKernels {
187
188 template <typename LHS, typename RHS, typename F>
189 __device__ void iter_cuda_kernel(uintptr_t lhs_base, uintptr_t lhs_stride,
190 uintptr_t rhs_base, uintptr_t rhs_stride,
191 size_t count, F func, void *context = nullptr)
192 {
193 const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
194 for(size_t idx = tid; idx < count; idx += blockDim.x * gridDim.x) {
195 (*func)(*reinterpret_cast<LHS *>(lhs_base + idx * lhs_stride),
196 *reinterpret_cast<const RHS *>(rhs_base + idx * rhs_stride), context);
197 }
198 }
199
200 template <typename REDOP, bool EXCL>
201 __device__ void redop_apply_wrapper(typename REDOP::LHS &lhs,
202 const typename REDOP::RHS &rhs, void *context)
203 {
204 REDOP &redop = *reinterpret_cast<REDOP *>(context);
205 redop.template apply_cuda<EXCL>(lhs, rhs);
206 }
207 template <typename REDOP, bool EXCL>
208 __device__ void redop_fold_wrapper(typename REDOP::RHS &rhs1,
209 const typename REDOP::RHS &rhs2, void *context)
210 {
211 REDOP &redop = *reinterpret_cast<REDOP *>(context);
212 redop.template fold_cuda<EXCL>(rhs1, rhs2);
213 }
214
215 template <typename REDOP, bool EXCL>
216 __global__ void apply_cuda_kernel(uintptr_t lhs_base, uintptr_t lhs_stride,
217 uintptr_t rhs_base, uintptr_t rhs_stride,
218 size_t count, REDOP redop)
219 {
220 iter_cuda_kernel<typename REDOP::LHS, typename REDOP::RHS>(
221 lhs_base, lhs_stride, rhs_base, rhs_stride, count,
222 redop_apply_wrapper<REDOP, EXCL>, (void *)&redop);
223 }
224
225 template <typename REDOP, bool EXCL>
226 __global__ void fold_cuda_kernel(uintptr_t rhs1_base, uintptr_t rhs1_stride,
227 uintptr_t rhs2_base, uintptr_t rhs2_stride,
228 size_t count, REDOP redop)
229 {
230 iter_cuda_kernel<typename REDOP::RHS, typename REDOP::RHS>(
231 rhs1_base, rhs1_stride, rhs2_base, rhs2_stride, count,
232 redop_fold_wrapper<REDOP, EXCL>, (void *)&redop);
233 }
234 }; // namespace ReductionKernels
235
236 template <typename REDOP, typename T /*= ReductionOpUntyped*/>
237 void add_cuda_redop_kernels_advanced(T *redop)
238 {
239 redop->cuda_apply_excl_fn_advanced = reinterpret_cast<void *>(
240 &ReductionKernelsAdvanced::apply_cuda_kernel<REDOP, true>);
241 redop->cuda_apply_nonexcl_fn_advanced = reinterpret_cast<void *>(
242 &ReductionKernelsAdvanced::apply_cuda_kernel<REDOP, false>);
243 redop->cuda_fold_excl_fn_advanced = reinterpret_cast<void *>(
244 &ReductionKernelsAdvanced::fold_cuda_kernel<REDOP, true>);
245 redop->cuda_fold_nonexcl_fn_advanced = reinterpret_cast<void *>(
246 &ReductionKernelsAdvanced::fold_cuda_kernel<REDOP, false>);
247
248 redop->cuda_apply_excl_fn_transpose = reinterpret_cast<void *>(
249 &ReductionKernelsTranspose::apply_cuda_kernel<REDOP, true>);
250
251 redop->cuda_apply_nonexcl_fn_transpose = reinterpret_cast<void *>(
252 &ReductionKernelsTranspose::apply_cuda_kernel<REDOP, false>);
253
254 redop->cuda_fold_excl_fn_transpose = reinterpret_cast<void *>(
255 &ReductionKernelsTranspose::fold_cuda_kernel<REDOP, true>);
256
257 redop->cuda_fold_nonexcl_fn_transpose = reinterpret_cast<void *>(
258 &ReductionKernelsTranspose::fold_cuda_kernel<REDOP, false>);
259 }
260
261 // this helper adds the appropriate kernels for REDOP to a
262 // ReductionOpUntyped,
263 // although the latter is templated to work around circular include deps
264 template <typename REDOP, typename T /*= ReductionOpUntyped*/>
265 void add_cuda_redop_kernels(T *redop)
266 {
267 // store the host proxy function pointer, as it's the same for all
268 // devices - translation to actual cudaFunction_t's happens later
269 redop->cuda_apply_excl_fn =
270 reinterpret_cast<void *>(&ReductionKernels::apply_cuda_kernel<REDOP, true>);
271 redop->cuda_apply_nonexcl_fn =
272 reinterpret_cast<void *>(&ReductionKernels::apply_cuda_kernel<REDOP, false>);
273 redop->cuda_fold_excl_fn =
274 reinterpret_cast<void *>(&ReductionKernels::fold_cuda_kernel<REDOP, true>);
275 redop->cuda_fold_nonexcl_fn =
276 reinterpret_cast<void *>(&ReductionKernels::fold_cuda_kernel<REDOP, false>);
277 add_cuda_redop_kernels_advanced<REDOP, T>(redop);
278 // Store some connections to the client's runtime instance that will be
279 // used for launching the above instantiations
280 // We use static cast here for type safety, as cudart is not ABI stable,
281 // so we want to ensure the functions used here match our expectations
282 typedef cudaError_t (*PFN_cudaLaunchKernel)(const void *func, dim3 gridDim,
283 dim3 blockDim, void **args,
284 size_t sharedMem, cudaStream_t stream);
285 PFN_cudaLaunchKernel launch_fn =
286 static_cast<PFN_cudaLaunchKernel>(cudaLaunchKernel);
287 redop->cudaLaunchKernel_fn = reinterpret_cast<void *>(launch_fn);
288#if CUDART_VERSION >= 11000
289 typedef cudaError_t (*PFN_cudaGetFuncBySymbol)(cudaFunction_t * functionPtr,
290 const void *symbolPtr);
291 PFN_cudaGetFuncBySymbol symbol_fn =
292 static_cast<PFN_cudaGetFuncBySymbol>(cudaGetFuncBySymbol);
293 redop->cudaGetFuncBySymbol_fn = reinterpret_cast<void *>(symbol_fn);
294#endif
295 }
296#endif
297
298 }; // namespace Cuda
299
300}; // namespace Realm
301
302#endif
#define cudaStream_t
Definition hip_cuda.h:27
#define cudaError_t
Definition hip_cuda.h:25
Definition activemsg.h:38
Definition cuda_reduc.h:58
AffineReducPair< N > subrects[MAX_RECTS]
Definition cuda_reduc.h:64
Definition cuda_reduc.h:47
size_t volume
Definition cuda_reduc.h:54
size_t extents[N]
Definition cuda_reduc.h:51
AffineReducSubRect< N > dst
Definition cuda_reduc.h:49
AffineReducSubRect< N > src
Definition cuda_reduc.h:48
Definition cuda_reduc.h:27
size_t elem_size
Definition cuda_reduc.h:34
Offset_t dst_strides[2]
Definition cuda_reduc.h:30
uintptr_t dst
Definition cuda_reduc.h:31
size_t volume
Definition cuda_reduc.h:33
Offset_t extents[3]
Definition cuda_reduc.h:28
Offset_t src_strides[2]
Definition cuda_reduc.h:29
uintptr_t src
Definition cuda_reduc.h:32
NodeID src
Definition ucp_internal.h:1