18#ifndef REALM_CUDA_REDOP_H
19#define REALM_CUDA_REDOP_H
31 template <
typename Offset_t =
size_t>
32 static __device__
inline void index_to_coords(Offset_t *coords, Offset_t index,
33 const Offset_t *extents,
34 const size_t elem_size)
39 for(
int i = 0; i < n - 1; i++) {
40 size_t div_tmp = div / extents[i];
41 coords[i] = div - div_tmp * extents[i];
45 coords[0] = coords[0] * elem_size;
48 template <
typename Offset_t =
size_t>
49 static __device__
inline size_t coords_to_index(
const Offset_t *coords,
50 const Offset_t *strides,
51 const size_t elem_size)
58 for(; d < n - 1; d++) {
68 template <
typename Offset_t =
size_t>
69 static __device__
inline size_t coords_to_index_transpose(
const Offset_t *coords,
70 const Offset_t *strides)
73 i = coords[1] * strides[0] + coords[2] * strides[1] + coords[0];
77 namespace ReductionKernelsAdvanced {
78 template <
typename REDOP,
bool EXCL>
81 size_t off = blockIdx.x * blockDim.x + threadIdx.x;
83 size_t vol = current_info.
volume;
84 size_t num_elems_rhs = current_info.
src.elem_size /
sizeof(
typename REDOP::RHS);
85 size_t redop_rhs_size =
sizeof(
typename REDOP::RHS);
86 typename REDOP::RHS *dst =
87 reinterpret_cast<typename REDOP::RHS *
>(current_info.
dst.addr);
88 typename REDOP::RHS *
src =
89 reinterpret_cast<typename REDOP::RHS *
>(current_info.
src.addr);
90 for(
size_t idx = off; idx < vol; idx += blockDim.x * gridDim.x) {
92 index_to_coords<size_t>(coords, idx, current_info.
extents, redop_rhs_size);
93 const size_t src_idx =
94 coords_to_index<size_t>(coords, current_info.
src.strides, redop_rhs_size);
95 const size_t dst_idx =
96 coords_to_index<size_t>(coords, current_info.
dst.strides, redop_rhs_size);
97 redop.template fold_cuda<EXCL>(
98 *
reinterpret_cast<typename REDOP::RHS *
>(&dst[dst_idx * num_elems_rhs]),
99 *
reinterpret_cast<const typename REDOP::RHS *
>(
100 &
src[src_idx * num_elems_rhs]));
104 template <
typename REDOP,
bool EXCL>
107 size_t off = blockIdx.x * blockDim.x + threadIdx.x;
109 size_t vol = current_info.
volume;
110 size_t num_elems_lhs = current_info.
dst.elem_size /
sizeof(
typename REDOP::LHS);
111 size_t num_elems_rhs = current_info.
src.elem_size /
sizeof(
typename REDOP::RHS);
112 size_t redop_lhs_size =
sizeof(
typename REDOP::LHS);
113 size_t redop_rhs_size =
sizeof(
typename REDOP::RHS);
114 typename REDOP::LHS *dst =
115 reinterpret_cast<typename REDOP::LHS *
>(current_info.
dst.addr);
116 typename REDOP::RHS *
src =
117 reinterpret_cast<typename REDOP::RHS *
>(current_info.
src.addr);
118 for(
size_t idx = off; idx < vol; idx += blockDim.x * gridDim.x) {
120 index_to_coords<size_t>(coords, idx, current_info.
extents, redop_rhs_size);
121 const size_t src_idx =
122 coords_to_index<size_t>(coords, current_info.
src.strides, redop_rhs_size);
123 const size_t dst_idx =
124 coords_to_index<size_t>(coords, current_info.
dst.strides, redop_lhs_size);
125 redop.template apply_cuda<EXCL>(
126 *
reinterpret_cast<typename REDOP::LHS *
>(&(dst[dst_idx * num_elems_lhs])),
127 *
reinterpret_cast<const typename REDOP::RHS *
>(
128 &
src[src_idx * num_elems_rhs]));
133 namespace ReductionKernelsTranspose {
134 template <
typename REDOP,
bool EXCL>
138 size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
139 size_t vol = current_info.
volume;
140 size_t num_elems = current_info.
elem_size /
sizeof(
typename REDOP::RHS);
141 typename REDOP::RHS *dst =
142 reinterpret_cast<typename REDOP::RHS *
>(current_info.
dst);
143 typename REDOP::RHS *
src =
144 reinterpret_cast<typename REDOP::RHS *
>(current_info.
src);
145 for(
size_t idx = offset; idx < vol; idx += blockDim.x * gridDim.x) {
147 index_to_coords<size_t>(coords, idx, current_info.
extents, 1);
148 const size_t src_idx =
149 coords_to_index_transpose<size_t>(coords, current_info.
src_strides);
150 const size_t dst_idx =
151 coords_to_index_transpose<size_t>(coords, current_info.
dst_strides);
152 redop.template fold_cuda<EXCL>(
153 *
reinterpret_cast<typename REDOP::RHS *
>(&dst[dst_idx * num_elems]),
154 *
reinterpret_cast<const typename REDOP::RHS *
>(&
src[src_idx * num_elems]));
158 template <
typename REDOP,
bool EXCL>
162 const size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
163 size_t vol = current_info.
volume;
164 size_t num_elems = current_info.
elem_size /
sizeof(
typename REDOP::RHS);
165 typename REDOP::LHS *dst =
166 reinterpret_cast<typename REDOP::LHS *
>(current_info.
dst);
167 typename REDOP::RHS *
src =
168 reinterpret_cast<typename REDOP::RHS *
>(current_info.
src);
170 for(
size_t idx = offset; idx < vol; idx += blockDim.x * gridDim.x) {
172 index_to_coords<size_t>(coords, idx, current_info.
extents, 1);
173 const size_t src_idx =
174 coords_to_index_transpose<size_t>(coords, current_info.
src_strides);
175 const size_t dst_idx =
176 coords_to_index_transpose<size_t>(coords, current_info.
dst_strides);
177 redop.template apply_cuda<EXCL>(
178 *
reinterpret_cast<typename REDOP::LHS *
>(&dst[dst_idx * num_elems]),
179 *
reinterpret_cast<const typename REDOP::RHS *
>(&
src[src_idx * num_elems]));
186 namespace ReductionKernels {
188 template <
typename LHS,
typename RHS,
typename F>
189 __device__
void iter_cuda_kernel(uintptr_t lhs_base, uintptr_t lhs_stride,
190 uintptr_t rhs_base, uintptr_t rhs_stride,
191 size_t count, F func,
void *context =
nullptr)
193 const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
194 for(
size_t idx = tid; idx < count; idx += blockDim.x * gridDim.x) {
195 (*func)(*
reinterpret_cast<LHS *
>(lhs_base + idx * lhs_stride),
196 *
reinterpret_cast<const RHS *
>(rhs_base + idx * rhs_stride), context);
200 template <
typename REDOP,
bool EXCL>
201 __device__
void redop_apply_wrapper(
typename REDOP::LHS &lhs,
202 const typename REDOP::RHS &rhs,
void *context)
204 REDOP &redop = *
reinterpret_cast<REDOP *
>(context);
205 redop.template apply_cuda<EXCL>(lhs, rhs);
207 template <
typename REDOP,
bool EXCL>
208 __device__
void redop_fold_wrapper(
typename REDOP::RHS &rhs1,
209 const typename REDOP::RHS &rhs2,
void *context)
211 REDOP &redop = *
reinterpret_cast<REDOP *
>(context);
212 redop.template fold_cuda<EXCL>(rhs1, rhs2);
215 template <
typename REDOP,
bool EXCL>
216 __global__
void apply_cuda_kernel(uintptr_t lhs_base, uintptr_t lhs_stride,
217 uintptr_t rhs_base, uintptr_t rhs_stride,
218 size_t count, REDOP redop)
220 iter_cuda_kernel<typename REDOP::LHS, typename REDOP::RHS>(
221 lhs_base, lhs_stride, rhs_base, rhs_stride, count,
222 redop_apply_wrapper<REDOP, EXCL>, (
void *)&redop);
225 template <
typename REDOP,
bool EXCL>
226 __global__
void fold_cuda_kernel(uintptr_t rhs1_base, uintptr_t rhs1_stride,
227 uintptr_t rhs2_base, uintptr_t rhs2_stride,
228 size_t count, REDOP redop)
230 iter_cuda_kernel<typename REDOP::RHS, typename REDOP::RHS>(
231 rhs1_base, rhs1_stride, rhs2_base, rhs2_stride, count,
232 redop_fold_wrapper<REDOP, EXCL>, (
void *)&redop);
236 template <
typename REDOP,
typename T >
237 void add_cuda_redop_kernels_advanced(T *redop)
239 redop->cuda_apply_excl_fn_advanced =
reinterpret_cast<void *
>(
240 &ReductionKernelsAdvanced::apply_cuda_kernel<REDOP, true>);
241 redop->cuda_apply_nonexcl_fn_advanced =
reinterpret_cast<void *
>(
242 &ReductionKernelsAdvanced::apply_cuda_kernel<REDOP, false>);
243 redop->cuda_fold_excl_fn_advanced =
reinterpret_cast<void *
>(
244 &ReductionKernelsAdvanced::fold_cuda_kernel<REDOP, true>);
245 redop->cuda_fold_nonexcl_fn_advanced =
reinterpret_cast<void *
>(
246 &ReductionKernelsAdvanced::fold_cuda_kernel<REDOP, false>);
248 redop->cuda_apply_excl_fn_transpose =
reinterpret_cast<void *
>(
249 &ReductionKernelsTranspose::apply_cuda_kernel<REDOP, true>);
251 redop->cuda_apply_nonexcl_fn_transpose =
reinterpret_cast<void *
>(
252 &ReductionKernelsTranspose::apply_cuda_kernel<REDOP, false>);
254 redop->cuda_fold_excl_fn_transpose =
reinterpret_cast<void *
>(
255 &ReductionKernelsTranspose::fold_cuda_kernel<REDOP, true>);
257 redop->cuda_fold_nonexcl_fn_transpose =
reinterpret_cast<void *
>(
258 &ReductionKernelsTranspose::fold_cuda_kernel<REDOP, false>);
264 template <
typename REDOP,
typename T >
265 void add_cuda_redop_kernels(T *redop)
269 redop->cuda_apply_excl_fn =
270 reinterpret_cast<void *
>(&ReductionKernels::apply_cuda_kernel<REDOP, true>);
271 redop->cuda_apply_nonexcl_fn =
272 reinterpret_cast<void *
>(&ReductionKernels::apply_cuda_kernel<REDOP, false>);
273 redop->cuda_fold_excl_fn =
274 reinterpret_cast<void *
>(&ReductionKernels::fold_cuda_kernel<REDOP, true>);
275 redop->cuda_fold_nonexcl_fn =
276 reinterpret_cast<void *
>(&ReductionKernels::fold_cuda_kernel<REDOP, false>);
277 add_cuda_redop_kernels_advanced<REDOP, T>(redop);
282 typedef cudaError_t (*PFN_cudaLaunchKernel)(
const void *func, dim3 gridDim,
283 dim3 blockDim,
void **args,
285 PFN_cudaLaunchKernel launch_fn =
286 static_cast<PFN_cudaLaunchKernel
>(cudaLaunchKernel);
287 redop->cudaLaunchKernel_fn =
reinterpret_cast<void *
>(launch_fn);
288#if CUDART_VERSION >= 11000
289 typedef cudaError_t (*PFN_cudaGetFuncBySymbol)(cudaFunction_t * functionPtr,
290 const void *symbolPtr);
291 PFN_cudaGetFuncBySymbol symbol_fn =
292 static_cast<PFN_cudaGetFuncBySymbol
>(cudaGetFuncBySymbol);
293 redop->cudaGetFuncBySymbol_fn =
reinterpret_cast<void *
>(symbol_fn);
#define cudaStream_t
Definition hip_cuda.h:27
#define cudaError_t
Definition hip_cuda.h:25
Definition activemsg.h:38
Definition cuda_reduc.h:58
AffineReducPair< N > subrects[MAX_RECTS]
Definition cuda_reduc.h:64
Definition cuda_reduc.h:47
size_t volume
Definition cuda_reduc.h:54
size_t extents[N]
Definition cuda_reduc.h:51
AffineReducSubRect< N > dst
Definition cuda_reduc.h:49
AffineReducSubRect< N > src
Definition cuda_reduc.h:48
Definition cuda_reduc.h:27
size_t elem_size
Definition cuda_reduc.h:34
Offset_t dst_strides[2]
Definition cuda_reduc.h:30
uintptr_t dst
Definition cuda_reduc.h:31
size_t volume
Definition cuda_reduc.h:33
Offset_t extents[3]
Definition cuda_reduc.h:28
Offset_t src_strides[2]
Definition cuda_reduc.h:29
uintptr_t src
Definition cuda_reduc.h:32
NodeID src
Definition ucp_internal.h:1