Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
hip_redop.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_HIP_REDOP_H
19#define REALM_HIP_REDOP_H
20
21#include "realm/realm_config.h"
22
23#if defined(__CUDACC__) || defined(__HIPCC__)
24#ifdef REALM_USE_HIP
25#include <hip/hip_runtime.h>
26#endif
27#endif
28
namespace Realm {

  namespace Hip {

#if defined(__CUDACC__) || defined(__HIPCC__)
    // the ability to add device kernels to a reduction op is only available
    // when using a compiler that understands CUDA/HIP device code
    namespace ReductionKernels {

      // Elementwise reduction "apply": for each idx in [0, count), performs
      //   redop.apply_hip<EXCL>(lhs[idx], rhs[idx])
      // where each operand is addressed as (base + idx * stride) in bytes
      // (bases/strides are uintptr_t byte addresses, cast back to LHS/RHS).
      // Uses a grid-stride loop, so any 1-D launch configuration covers the
      // full range, including the tail when count is not a multiple of the
      // grid size.
      template <typename REDOP, bool EXCL>
      __global__ void apply_hip_kernel(uintptr_t lhs_base, uintptr_t lhs_stride,
                                       uintptr_t rhs_base, uintptr_t rhs_stride,
                                       size_t count, REDOP redop)
      {
        // widen before multiplying so the flat thread id cannot overflow
        // 32-bit unsigned arithmetic on very large grids
        size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
        size_t grid_stride = size_t(blockDim.x) * gridDim.x;
        // BUGFIX: the loop previously tested and advanced 'tid' while the
        // body indexed with 'idx', so each thread re-applied the reduction
        // to one fixed element and never visited the rest of its range
        for(size_t idx = tid; idx < count; idx += grid_stride)
          redop.template apply_hip<EXCL>(
              *reinterpret_cast<typename REDOP::LHS *>(lhs_base + idx * lhs_stride),
              *reinterpret_cast<const typename REDOP::RHS *>(rhs_base +
                                                             idx * rhs_stride));
      }

      // Elementwise reduction "fold": for each idx in [0, count), performs
      //   redop.fold_hip<EXCL>(rhs1[idx], rhs2[idx])
      // with the same byte-addressed base/stride convention and grid-stride
      // coverage as apply_hip_kernel above.
      template <typename REDOP, bool EXCL>
      __global__ void fold_hip_kernel(uintptr_t rhs1_base, uintptr_t rhs1_stride,
                                      uintptr_t rhs2_base, uintptr_t rhs2_stride,
                                      size_t count, REDOP redop)
      {
        size_t tid = size_t(blockIdx.x) * blockDim.x + threadIdx.x;
        size_t grid_stride = size_t(blockDim.x) * gridDim.x;
        // same fix as apply_hip_kernel: iterate on 'idx', not 'tid'
        for(size_t idx = tid; idx < count; idx += grid_stride)
          redop.template fold_hip<EXCL>(
              *reinterpret_cast<typename REDOP::RHS *>(rhs1_base + idx * rhs1_stride),
              *reinterpret_cast<const typename REDOP::RHS *>(rhs2_base +
                                                             idx * rhs2_stride));
      }

    }; // namespace ReductionKernels

    // this helper adds the appropriate kernels for REDOP to a ReductionOpUntyped,
    // although the latter is templated to work around circular include deps
    template <typename REDOP, typename T /*= ReductionOpUntyped*/>
    void add_hip_redop_kernels(T *redop)
    {
      // store the host proxy function pointer, as it's the same for all
      // devices - translation to actual device function handles happens later
      redop->hip_apply_excl_fn =
          reinterpret_cast<void *>(&ReductionKernels::apply_hip_kernel<REDOP, true>);
      redop->hip_apply_nonexcl_fn =
          reinterpret_cast<void *>(&ReductionKernels::apply_hip_kernel<REDOP, false>);
      redop->hip_fold_excl_fn =
          reinterpret_cast<void *>(&ReductionKernels::fold_hip_kernel<REDOP, true>);
      redop->hip_fold_nonexcl_fn =
          reinterpret_cast<void *>(&ReductionKernels::fold_hip_kernel<REDOP, false>);
    }
#endif

  }; // namespace Hip

}; // namespace Realm
85
86#endif
Definition activemsg.h:38