Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
cuda_module.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CUDA_H
19#define REALM_CUDA_H
20
21#include <type_traits>
22#include "realm/realm_config.h"
23#include "realm/module.h"
24#include "realm/processor.h"
25#include "realm/network.h"
26#include "realm/atomics.h"
27
28// realm/cuda_module.h is designed to be include-able even when the system
29// doesn't actually have CUDA installed, so we need to declare types that
30// are compatible with the CUDA driver and runtime APIs - we can't "extern"
31// a typedef (e.g. cudaStream_t) but we can forward declare the underlying
32// struct that those types are pointers to
33struct CUstream_st; // cudaStream_t == CUstream == CUstream_st *
34struct CUevent_st;
35struct CUctx_st;
36struct CUfunc_st;
37
38namespace Realm {
39
40 namespace NetworkSegmentInfo {
41 // CUDA device memory - extra is a uintptr_t'd pointer to the GPU
42 // object
43 static const MemoryType CudaDeviceMem = 2;
44
45 // CUDA managed memory - extra is a uintptr_t'd pointer to _one of_
46 // the GPU objects
47 static const MemoryType CudaManagedMem = 4;
48 }; // namespace NetworkSegmentInfo
49
50 namespace Cuda {
51
52 // a running task on a CUDA processor is assigned a stream by Realm, and
53 // any work placed on this stream is automatically captured by the
54 // completion event for the task
55 // when using the CUDA runtime hijack, Realm will force work launched via
56 // the runtime API to use the task's stream, but without hijack, or for
57 // code that uses the CUDA driver API, the task must explicitly request
58 // the stream that is associated with the task and place work on it to
59 // avoid more expensive forms of completion detection for the task
60 // NOTE: this function will return a null pointer if called outside of a
61 // task running on a CUDA processor
 62 REALM_PUBLIC_API CUstream_st *get_task_cuda_stream();
 63
64 // when Realm is not using the CUDA runtime hijack to force work onto the
65 // task's stream, it conservatively uses a full context synchronization to
66 // make sure all device work launched by the task is captured by the task
67 // completion event - if a task uses `get_task_cuda_stream` and places all
68 // work on that stream, this API can be used to tell Realm on a per-task
69 // basis that full context synchronization is not required
 70 REALM_PUBLIC_API void set_task_ctxsync_required(bool is_required);
 71
72 // rather than using the APIs above, CUDA processors also support task
73 // implementations that are natively stream aware - if a task function uses
74 // the `Cuda::StreamAwareTaskFuncPtr` prototype below (instead of the normal
75 // `Processor::TaskFuncPtr`), the following differences apply:
76 // a) it need not call `get_task_cuda_stream` because it gets the same value
77 // directly as an argument
78 // b) by default, a context synchronization will NOT be performed as part of
79 // task completion detection (this can still be overridden with a call to
80 // `set_task_ctxsync_required(true)` if a task puts work outside the
 81 // specified stream for some reason)
82 // c) if a stream-aware task has preconditions that involve device work, that
83 // work will be tied into the task's stream, but the task body may start
84 // executing BEFORE that work is complete (i.e. for correctness, all work
85 // launched by the task must be properly ordered (using the CUDA APIs)
86 // after anything already in the stream assigned to the task
87 typedef void (*StreamAwareTaskFuncPtr)(const void *args, size_t arglen,
88 const void *user_data, size_t user_data_len,
89 Processor proc, CUstream_st *stream);
90
 91 // this is the same data structure as CUuuid
92 static const size_t UUID_SIZE = 16; // bytes
93 typedef char Uuid[UUID_SIZE];
94
 95 // fills in CUDA-related info for the CUDA-capable device associated with
 96 // processor
 97 // `p` if available and returns true, or returns false if the processor is unknown,
 98 // not associated with a CUDA-capable device, or information is unavailable
 99 REALM_PUBLIC_API bool get_cuda_device_uuid(Processor p, Uuid *uuid);
 100
 101 REALM_PUBLIC_API bool get_cuda_device_id(Processor p, int *device);
 102
103 class GPU;
104 class GPUWorker;
105 struct GPUInfo;
106 class GPUZCMemory;
108
110 friend class CudaModule;
111
112 protected:
114
116
117 public:
118 virtual void configure_from_cmdline(std::vector<std::string> &cmdline);
119
120 public:
121 // configurations
122 size_t cfg_zc_mem_size = 64 << 20, cfg_zc_ib_size = 256 << 20;
123 size_t cfg_fb_mem_size = 256 << 20, cfg_fb_ib_size = 128 << 20;
126 size_t cfg_dynfb_max_size = ~size_t(0);
128 std::string cfg_gpu_idxs;
134 unsigned cfg_skip_gpu_count = 0;
135 bool cfg_skip_busy_gpus = false;
137 int cfg_task_legacy_sync = 0; // 0 = no, 1 = yes
138 int cfg_task_context_sync = -1; // 0 = no, 1 = yes, -1 = default (based on hijack)
142 size_t cfg_hostreg_limit = 1 << 30;
144 bool cfg_use_cuda_ipc = true;
146 bool cfg_enable_cupti = false;
147
148 // resources
152 std::vector<size_t> res_fbmem_sizes;
153 };
154
158 CUfunc_st *apply_excl = nullptr;
159 CUfunc_st *apply_nonexcl = nullptr;
160 CUfunc_st *fold_excl = nullptr;
161 CUfunc_st *fold_nonexcl = nullptr;
162 };
163
164 // our interface to the rest of the runtime
166 protected:
168
169 public:
170 virtual ~CudaModule(void);
171
173
175
176 // do any general initialization - this is called after all configuration is
177 // complete
178 virtual void initialize(RuntimeImpl *runtime);
179
180 // create any memories provided by this module (default == do nothing)
181 // (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id)
182 virtual void create_memories(RuntimeImpl *runtime);
183
184 // create any processors provided by the module (default == do nothing)
185 // (each new ProcessorImpl should use a Processor from
186 // RuntimeImpl::next_local_processor_id)
187 virtual void create_processors(RuntimeImpl *runtime);
188
189 // create any DMA channels provided by the module (default == do nothing)
190 virtual void create_dma_channels(RuntimeImpl *runtime);
191
192 // create any code translators provided by the module (default == do nothing)
193 virtual void create_code_translators(RuntimeImpl *runtime);
194
195 // if a module has to do cleanup that involves sending messages to other
196 // nodes, this must be done in the pre-detach cleanup
197 virtual void pre_detach_cleanup(void);
198
199 // clean up any common resources created by the module - this will be called
200 // after all memories/processors/etc. have been shut down and destroyed
201 virtual void cleanup(void);
202
203 // free functions above are normally used, but these can be used directly
204 // if you already have a pointer to the CudaModule
205 CUstream_st *get_task_cuda_stream();
206 void set_task_ctxsync_required(bool is_required);
207
213 Event make_realm_event(CUevent_st *cuda_event);
220 Event make_realm_event(CUstream_st *cuda_stream);
221
222 bool get_cuda_device_uuid(Processor p, Uuid *uuid) const;
223
224 bool get_cuda_device_id(Processor p, int *device) const;
225
226 bool get_cuda_context(Processor p, CUctx_st **context) const;
227
228 bool register_reduction(Event &event, const CudaRedOpDesc *descs, size_t num);
229
230 public:
233
234 // "global" variables live here too
236 std::map<GPU *, GPUWorker *> dedicated_workers;
237 std::vector<GPUInfo *> gpu_info;
238 std::vector<GPU *> gpus;
239 void *zcmem_cpu_base, *zcib_cpu_base;
241 void *uvm_base; // guaranteed to be same for CPU and GPU
245
248 atomic<size_t> cudaipc_responses_received{0};
249 int cuda_api_version = 0;
250 };
251
252 }; // namespace Cuda
253
254}; // namespace Realm
255
256#include "realm/cuda/cuda_module.inl"
257
258#endif
Definition cuda_module.h:109
bool cfg_pin_sysmem
Definition cuda_module.h:131
std::vector< size_t > res_fbmem_sizes
Definition cuda_module.h:152
bool cfg_fences_use_callbacks
Definition cuda_module.h:132
int cfg_task_legacy_sync
Definition cuda_module.h:137
size_t cfg_zc_mem_size
Definition cuda_module.h:122
size_t cfg_fb_mem_size
Definition cuda_module.h:123
size_t cfg_dynfb_max_size
Definition cuda_module.h:126
bool cfg_multithread_dma
Definition cuda_module.h:141
bool cfg_use_dynamic_fb
Definition cuda_module.h:125
bool cfg_use_worker_threads
Definition cuda_module.h:130
bool cfg_enable_cupti
Definition cuda_module.h:146
size_t res_min_fbmem_size
Definition cuda_module.h:151
unsigned cfg_d2d_streams
Definition cuda_module.h:129
int cfg_d2d_stream_priority
Definition cuda_module.h:143
std::string cfg_gpu_idxs
Definition cuda_module.h:128
virtual void configure_from_cmdline(std::vector< std::string > &cmdline)
unsigned cfg_task_streams
Definition cuda_module.h:129
bool cfg_suppress_hijack_warning
Definition cuda_module.h:133
size_t cfg_fb_ib_size
Definition cuda_module.h:123
int cfg_pageable_access
Definition cuda_module.h:145
size_t cfg_zc_ib_size
Definition cuda_module.h:122
int res_num_gpus
Definition cuda_module.h:150
bool cfg_use_shared_worker
Definition cuda_module.h:130
size_t cfg_hostreg_limit
Definition cuda_module.h:142
int cfg_task_context_sync
Definition cuda_module.h:138
bool cfg_skip_busy_gpus
Definition cuda_module.h:135
unsigned cfg_skip_gpu_count
Definition cuda_module.h:134
bool cfg_use_cuda_ipc
Definition cuda_module.h:144
bool resource_discovered
Definition cuda_module.h:149
int cfg_num_gpus
Definition cuda_module.h:127
bool cfg_lmem_resize_to_max
Definition cuda_module.h:140
int cfg_max_ctxsync_threads
Definition cuda_module.h:139
size_t cfg_min_avail_mem
Definition cuda_module.h:136
size_t cfg_uvm_mem_size
Definition cuda_module.h:124
Definition cuda_module.h:165
virtual void initialize(RuntimeImpl *runtime)
virtual void create_dma_channels(RuntimeImpl *runtime)
bool get_cuda_context(Processor p, CUctx_st **context) const
virtual void create_memories(RuntimeImpl *runtime)
CudaModule(RuntimeImpl *_runtime)
GPUReplHeapListener * rh_listener
Definition cuda_module.h:243
atomic< bool > initialization_complete
Definition cuda_module.h:244
Mutex cudaipc_mutex
Definition cuda_module.h:246
CUstream_st * get_task_cuda_stream()
CudaModuleConfig * config
Definition cuda_module.h:231
GPUZCMemory * uvmmem
Definition cuda_module.h:242
std::vector< GPU * > gpus
Definition cuda_module.h:238
virtual void create_processors(RuntimeImpl *runtime)
GPUZCMemory * zcmem
Definition cuda_module.h:240
void * zcib_cpu_base
Definition cuda_module.h:239
void * uvm_base
Definition cuda_module.h:241
virtual void create_code_translators(RuntimeImpl *runtime)
GPUWorker * shared_worker
Definition cuda_module.h:235
RuntimeImpl * runtime
Definition cuda_module.h:232
bool register_reduction(Event &event, const CudaRedOpDesc *descs, size_t num)
void set_task_ctxsync_required(bool is_required)
static ModuleConfig * create_module_config(RuntimeImpl *runtime)
bool get_cuda_device_id(Processor p, int *device) const
virtual void cleanup(void)
virtual void pre_detach_cleanup(void)
Mutex::CondVar cudaipc_condvar
Definition cuda_module.h:247
Event make_realm_event(CUevent_st *cuda_event)
Returns a Realm::Event that will be triggered after the given cuda_event has completed.
Event make_realm_event(CUstream_st *cuda_stream)
Returns a Realm::Event that will be triggered after the given cuda_stream has completed its currently enqueued work.
std::vector< GPUInfo * > gpu_info
Definition cuda_module.h:237
virtual ~CudaModule(void)
std::map< GPU *, GPUWorker * > dedicated_workers
Definition cuda_module.h:236
bool get_cuda_device_uuid(Processor p, Uuid *uuid) const
static Module * create_module(RuntimeImpl *runtime)
Definition cuda_internal.h:1040
Definition cuda_internal.h:298
Definition cuda_internal.h:659
Definition cuda_internal.h:392
Definition event.h:50
Definition module_config.h:32
Definition module.h:42
Definition processor.h:37
static const Processor NO_PROC
Definition processor.h:54
Definition runtime_impl.h:264
Definition mutex.h:325
Definition mutex.h:223
Definition atomics.h:31
#define REALM_PUBLIC_API
Definition compiler_support.h:217
REALM_PUBLIC_API bool get_cuda_device_id(Processor p, int *device)
REALM_PUBLIC_API CUstream_st * get_task_cuda_stream()
REALM_PUBLIC_API void set_task_ctxsync_required(bool is_required)
char Uuid[UUID_SIZE]
Definition cuda_module.h:93
REALM_PUBLIC_API bool get_cuda_device_uuid(Processor p, Uuid *uuid)
void(* StreamAwareTaskFuncPtr)(const void *args, size_t arglen, const void *user_data, size_t user_data_len, Processor proc, CUstream_st *stream)
Definition cuda_module.h:87
unsigned MemoryType
Definition network.h:244
Definition activemsg.h:38
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
Definition cuda_module.h:155
CUfunc_st * fold_nonexcl
Definition cuda_module.h:161
CUfunc_st * apply_excl
Definition cuda_module.h:158
CUfunc_st * fold_excl
Definition cuda_module.h:160
Processor proc
Definition cuda_module.h:157
ReductionOpID redop_id
Definition cuda_module.h:156
CUfunc_st * apply_nonexcl
Definition cuda_module.h:159
Definition cuda_internal.h:127