Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
cuda_internal.h
Go to the documentation of this file.
1/*
2 * Copyright 2026 Stanford University, NVIDIA Corporation, Los Alamos National Laboratory
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CUDA_INTERNAL_H
19#define REALM_CUDA_INTERNAL_H
20
22
23#include <memory>
24#include <unordered_map>
25#if !defined(CUDA_ENABLE_DEPRECATED)
26// Ignore deprecation warnings from cuda headers
27#define CUDA_ENABLE_DEPRECATED 1
28#endif
29#include <cuda.h>
30#include <nvml.h>
31#include <cupti.h>
32#if defined(REALM_USE_CUDART_HIJACK)
33#include <cuda_runtime_api.h> // For cudaDeviceProp
34#endif
35
36// For CUDA runtime's dim3 definition
37#include <vector_types.h>
38
39#include "realm/operation.h"
40#include "realm/threads.h"
41#include "realm/circ_queue.h"
42#include "realm/indexspace.h"
43#include "realm/proc_impl.h"
44#include "realm/mem_impl.h"
45#include "realm/bgwork.h"
49
#if CUDART_VERSION < 11000
// Check a CUDA *runtime* API result code. Before CUDA 11.0, runtime error
// codes did not correspond 1:1 with driver (CUresult) codes, so only the raw
// integer value can be reported before terminating the process.
#define CHECK_CUDART(cmd) \
  do { \
    int ret = (int)(cmd); \
    if(ret != 0) { \
      fprintf(stderr, "CUDART: %s = %d\n", #cmd, ret); \
      assert(0); /* trap in debug builds */ \
      exit(1);   /* still terminate when NDEBUG disables assert */ \
    } \
  } while(0)
#else
// Since CUDA TK11.0, runtime and driver error codes are 1:1 correlated
#define CHECK_CUDART(cmd) CHECK_CU((CUresult)(cmd))
#endif
64
// Need CUDA 6.5 or later for good error reporting
#if CUDA_VERSION >= 6050
// Log a CUDA driver API failure through the Realm GPU logger, resolving the
// CUresult to its symbolic name and human-readable description via
// cuGetErrorName/cuGetErrorString (both available since CUDA 6.5).
#define REPORT_CU_ERROR(level, cmd, ret) \
  do { \
    const char *name, *str; \
    CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorName)(ret, &name); \
    CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorString)(ret, &str); \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret \
                          << '(' << name << "): " << str; \
  } while(0)
#else
// Pre-6.5 fallback: only the numeric error code can be reported.
// (Fix: the streaming statement was missing its terminating ';', which made
// this branch fail to compile at every expansion site.)
#define REPORT_CU_ERROR(level, cmd, ret) \
  do { \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
#endif
81
// Evaluate a CUDA driver API call; on any result other than CUDA_SUCCESS,
// log it at LEVEL_ERROR via REPORT_CU_ERROR and abort the process.
#define CHECK_CU(cmd) \
  do { \
    CUresult ret = (cmd); \
    if(ret != CUDA_SUCCESS) { \
      REPORT_CU_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
      abort(); \
    } \
  } while(0)
90
// Log an NVML API failure through the Realm GPU logger (numeric code only;
// NVML error strings are not resolved here).
#define REPORT_NVML_ERROR(level, cmd, ret) \
  do { \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
95
// Evaluate an NVML call; on any result other than NVML_SUCCESS, log it at
// LEVEL_ERROR via REPORT_NVML_ERROR and abort the process.
#define CHECK_NVML(cmd) \
  do { \
    nvmlReturn_t ret = (cmd); \
    if(ret != NVML_SUCCESS) { \
      REPORT_NVML_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
      abort(); \
    } \
  } while(0)
104
// True if 'stream' is one of CUDA's implicit default streams: the null
// stream handle (0), the legacy default stream, or the per-thread default
// stream sentinel.
#define IS_DEFAULT_STREAM(stream) \
  (((stream) == 0) || ((stream) == CU_STREAM_LEGACY) || \
   ((stream) == CU_STREAM_PER_THREAD))
108
// Log a CUPTI API failure through the Realm GPU logger (numeric code only).
#define REPORT_CUPTI_ERROR(level, cmd, ret) \
  do { \
    log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
  } while(0)
113
// Evaluate a CUPTI call; on any result other than CUPTI_SUCCESS, log it at
// LEVEL_ERROR via REPORT_CUPTI_ERROR and abort the process.
#define CHECK_CUPTI(cmd) \
  do { \
    CUptiResult ret = (cmd); \
    if(ret != CUPTI_SUCCESS) { \
      REPORT_CUPTI_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
      abort(); \
    } \
  } while(0)
122
123namespace Realm {
124
125 namespace Cuda {
126
    // Properties discovered for a single CUDA-capable device.
    // NOTE(review): this listing has extraction gaps -- several members are
    // missing below; consult the original header before relying on layout.
    struct GPUInfo {
      int index; // index used by CUDA runtime
      CUdevice device;       // CUDA driver device handle
      nvmlDevice_t nvml_dev; // matching NVML handle for the same device
      CUuuid uuid;           // device UUID
      int major;             // compute capability major version -- TODO confirm
      int minor;             // compute capability minor version -- TODO confirm
      static const size_t MAX_NAME_LEN = 256;
      static const size_t MAX_NUMA_NODE_LEN = 20;
      std::set<CUdevice> peers; // other GPUs we can do p2p copies with
      size_t c2c_bandwidth = 0; // Current enabled c2c bandwidth
      size_t pci_bandwidth = 0; // Current enabled pci-e bandwidth
      size_t nvswitch_bandwidth = 0; // Current enabled nvswitch bandwidth
      // presumably: host and device share a unified VA range -- verify against
      // the code that sets this
      bool host_gpu_same_va = false;
      // per-peer bandwidth/latency estimates; NOTE(review): indexing scheme
      // not visible here -- confirm against the initialization code
      std::vector<size_t> logical_peer_bandwidth;
      std::vector<size_t> logical_peer_latency;
      // Fabric information for this gpu
      bool fabric_supported = false;
      unsigned fabric_clique = -1U; // -1U == no fabric clique assigned
      CUuuid fabric_uuid = {0};

#ifdef REALM_USE_CUDART_HIJACK
      cudaDeviceProp prop; // cached runtime device properties (hijack mode only)
#endif
    };
160
168
    // Forward declarations
170 class GPUProcessor;
171 class GPUWorker;
172 class GPUStream;
173 class GPUFBMemory;
174 class GPUDynamicFBMemory;
175 class GPUZCMemory;
176 class GPUFBIBMemory;
177 class GPUAllocation;
178 class GPU;
179 class CudaModule;
180
181 extern CudaModule *cuda_module_singleton;
182
184 public:
186 void *create_context(Task *task) const override;
187 void destroy_context(Task *task, void *context) const override;
188 void *create_context(InternalTask *task) const override;
189 void destroy_context(InternalTask *task, void *context) const override;
190 GPU *gpu = nullptr;
191 GPUProcessor *proc = nullptr; // TODO(cperry): delete me
192 };
193
194 // an interface for receiving completion notification for a GPU operation
195 // (right now, just copies)
197 public:
199
200 virtual void request_completed(void) = 0;
201 };
202
204 public:
207
208 virtual void mark_finished(bool successful);
209
210 virtual void request_cancellation(void);
211
213
214 virtual void print(std::ostream &os) const;
215
219 DummyLock>
221
222 protected:
223 static void cuda_callback(CUstream stream, CUresult res, void *data);
224 GPU *gpu = nullptr;
225 };
226
228 public:
230
231 virtual void request_cancellation(void) { return; };
232
234
235 virtual void print(std::ostream &os) const;
236
238
239 protected:
240 static void cuda_start_callback(CUstream stream, CUresult res, void *data);
241 };
242
    // a class that represents a CUDA stream and work associated with
    // it (e.g. queued copies, events in flight)
    // a stream is also associated with a GPUWorker that it will register
    // with when async work needs doing
    // NOTE(review): this listing has extraction gaps -- several declarations
    // (return types, members) are truncated or missing below.
    class GPUStream {
    public:
      // NOTE(review): rel_priority is presumably relative to the device's
      // default stream priority -- confirm against the constructor definition
      GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0);

      GPU *get_gpu(void) const;
      get_stream(void) const; // needed by librealm_kokkos.so

      // may be called by anybody to enqueue a copy or an event
      // add_event: records 'event' with an optional fence / completion
      // notification / start marker to be triggered when it is reaped
      void add_event(CUevent event, GPUWorkFence *fence,
                     GPUCompletionNotification *notification = NULL,
                     GPUWorkStart *start = NULL);
      void wait_on_streams(const std::set<GPUStream *> &other_streams);

      // atomically checks rate limit counters and returns true if 'bytes'
      // worth of copies can be submitted or false if not (in which case
      // the progress counter on the xd will be updated when it should try
      // again)
      bool ok_to_submit_copy(size_t bytes, XferDes *xd);
      // NOTE(review): return-value semantics (work remaining?) not visible
      // here -- confirm against the .cc implementation
      bool reap_events(TimeLimit work_until);

    protected:
      // may only be tested with lock held
      bool has_work(void) const;

      CUstream stream; // the underlying CUDA stream handle

#ifdef USE_CQ
#else
      std::deque<PendingEvent> pending_events;
#endif
    };
294
295 // a GPUWorker is responsible for making progress on one or more GPUStreams -
296 // this may be done directly by a GPUProcessor or in a background thread
297 // spawned for the purpose
299 public:
301 virtual ~GPUWorker(void);
302
303 // adds a stream that has work to be done
305
306 // used to start a dedicate thread (mutually exclusive with being
307 // registered with a background work manager)
310
311 bool do_work(TimeLimit work_until);
312
313 public:
314 void thread_main(void);
315
316 protected:
317 // used by the background thread
318 // processes work on streams, optionally sleeping for work to show up
319 // returns true if work remains to be done
320 bool process_streams(bool sleep_on_empty);
321
324
327
328 // used by the background thread (if any)
333 };
334
335 // a little helper class to manage a pool of CUevents that can be reused
336 // to reduce alloc/destroy overheads
338 public:
339 GPUEventPool(int _batch_size = 256);
340
341 // allocating the initial batch of events and cleaning up are done with
342 // these methods instead of constructor/destructor because we don't
343 // manage the GPU context in this helper class
344 void init_pool(int init_size = 0 /* default == batch size */);
345 void empty_pool(void);
346
347 CUevent get_event(bool external = false);
348 void return_event(CUevent e, bool external = false);
349
350 protected:
353 std::vector<CUevent> available_events;
354 };
355
356 // when the runtime hijack is not enabled/active, a cuCtxSynchronize
357 // is required to ensure a task's completion event covers all of its
358 // actions - rather than blocking an important thread, we create a
359 // small thread pool to handle these
384
385 struct FatBin;
386 struct RegisteredVariable;
387 struct RegisteredFunction;
388
389 // a GPU object represents our use of a given CUDA-capable GPU - this will
390 // have an associated CUDA context, a (possibly shared) worker thread, a
391 // processor, and an FB memory (the ZC memory is shared across all GPUs)
392 class GPU {
393 public:
394 GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context);
395 ~GPU(void);
396
397 void push_context(void);
398 void pop_context(void);
399
401
402 void create_processor(RuntimeImpl *runtime, size_t stack_size);
403 void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size);
404 void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size);
405
407
408 bool can_access_peer(const GPU *peer) const;
409
410 GPUStream *find_stream(CUstream stream) const;
412 get_null_task_stream(void) const; // needed by librealm_kokkos.so
413 GPUStream *get_next_task_stream(bool create = false);
415
416 void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize,
417 size_t volume, GPUStream *stream);
418 void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize,
419 size_t volume, bool multified_optimized,
420 GPUStream *stream);
422 size_t elemSize, GPUStream *stream);
423
424 void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size,
425 size_t field_size, size_t volume,
426 GPUStream *stream);
427 bool is_accessible_host_mem(const MemoryImpl *mem) const;
428 bool is_accessible_gpu_mem(const MemoryImpl *mem) const;
429
431 ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl,
432 CUfunction fold_excl, CUfunction fold_nonexcl, CUfunction apply_excl_advanced,
433 CUfunction apply_nonexcl_advanced, CUfunction fold_excl_advanced,
434 CUfunction fold_nonexcl_advanced, CUfunction apply_excl_transpose,
435 CUfunction apply_nonexcl_transpose, CUfunction fold_excl_transpose,
436 CUfunction fold_nonexcl_transpose);
437
438 protected:
439 CUmodule load_cuda_module(const void *data);
440
441 public:
443 CudaModule *module = nullptr;
444 GPUInfo *info = nullptr;
445 GPUWorker *worker = nullptr;
446 GPUProcessor *proc = nullptr;
447
448 std::map<CUdeviceptr, GPUAllocation> allocations;
449 GPUFBMemory *fbmem = nullptr;
452
453 CUcontext context = nullptr;
454
455 CUmodule device_module = nullptr;
456
457 struct GPUFuncInfo {
458 CUfunction func;
461 };
462
463 // The maximum value of log2(type_bytes) that cuda kernels handle.
464 // log2(1 byte) --> 0
465 // log2(2 bytes) --> 1
466 // log2(4 bytes) --> 2
467 // log2(8 bytes) --> 3
468 // log2(16 bytes) --> 4
469 static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES = 5;
470
481
482 CUdeviceptr fbmem_base = 0;
483
484 CUdeviceptr fb_ibmem_base = 0;
485
486 // which system memories have been registered and can be used for cuMemcpyAsync
487 std::set<Memory> pinned_sysmems;
488
489 // managed memories we can concurrently access
490 std::set<Memory> managed_mems;
491
492 // which other FBs we have peer access to
493 std::set<Memory> peer_fbs;
494
495 // streams for different copy types and a pile for actual tasks
499 std::vector<GPUStream *> device_to_device_streams;
500 std::vector<GPUStream *> peer_to_peer_streams; // indexed by target
501 std::vector<GPUStream *> task_streams;
505
507
508 // this can technically be different in each context (but probably isn't
509 // in practice)
511
516 uintptr_t local_base;
517 uintptr_t address_offset; // add to convert from original to local base
518 };
519 std::vector<CudaIpcMapping> cudaipc_mappings;
520 std::map<NodeID, GPUStream *> cudaipc_streams;
523
525 CUfunction apply_nonexcl = nullptr;
526 CUfunction apply_excl = nullptr;
527 CUfunction fold_nonexcl = nullptr;
528 CUfunction fold_excl = nullptr;
529 CUfunction apply_nonexcl_advanced = nullptr;
530 CUfunction apply_excl_advanced = nullptr;
531 CUfunction fold_nonexcl_advanced = nullptr;
532 CUfunction fold_excl_advanced = nullptr;
533 CUfunction apply_nonexcl_transpose = nullptr;
534 CUfunction apply_excl_transpose = nullptr;
535 CUfunction fold_nonexcl_transpose = nullptr;
536 CUfunction fold_excl_transpose = nullptr;
537 };
538
539 std::unordered_map<ReductionOpID, GPUReductionOpEntry> gpu_reduction_table;
540 };
541
542 // helper to push/pop a GPU's context by scope
544 public:
548
549 protected:
551 };
552
553 class REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUProcessor // needed by librealm_kokkos.so
555 public:
556 GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me,
557 Realm::CoreReservationSet &crs, size_t _stack_size);
558 virtual ~GPUProcessor(void);
559
560 public:
561 virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc,
562 const ByteArrayRef &user_data);
563
564 virtual void shutdown(void);
565
566 protected:
568 const ByteArrayRef &task_args);
569
570 public:
572
573 protected:
575
577 Processor::TaskFuncPtr fnptr;
578 Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr;
580 };
581
582 // we're not using the parent's task table, but we can use the mutex
583 // RWLock task_table_mutex;
584 std::map<Processor::TaskFuncID, GPUTaskTableEntry> gpu_task_table;
585 };
586
587 // this can be attached to any MemoryImpl if the underlying memory is
588 // guaranteed to belong to a given CUcontext - this will allow that
589 // context's processor and dma channels to work with it
590 // the creator is expected to know what CUcontext they want but need
591 // not know which GPU object that corresponds to
593 public:
594 CudaDeviceMemoryInfo(CUcontext _context);
595
596 CUcontext context;
598 };
599
601 public:
602 GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base,
603 size_t _size);
604
605 virtual ~GPUFBMemory(void);
606
607 // these work, but they are SLOW
608 virtual void get_bytes(off_t offset, void *dst, size_t size);
609 virtual void put_bytes(off_t offset, const void *src, size_t size);
610
611 virtual void *get_direct_ptr(off_t offset, size_t size);
612
613 // GPUFBMemory supports ExternalCudaMemoryResource and
614 // ExternalCudaArrayResource
616 size_t &inst_offset);
618
619 // for re-registration purposes, generate an ExternalInstanceResource *
620 // (if possible) for a given instance, or a subset of one
623 span<const FieldID> fields, bool read_only);
624
625 public:
627 CUdeviceptr base;
629 };
630
632 public:
633 GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu,
634 size_t _max_size);
635
636 virtual ~GPUDynamicFBMemory(void);
637 void cleanup(void);
638
639 // deferred allocation not supported
641 bool need_alloc_result,
642 bool poisoned,
643 TimeLimit work_until);
644
645 virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned,
646 TimeLimit work_until);
647
648 // these work, but they are SLOW
649 virtual void get_bytes(off_t offset, void *dst, size_t size);
650 virtual void put_bytes(off_t offset, const void *src, size_t size);
651
652 virtual void *get_direct_ptr(off_t offset, size_t size);
653
654 // GPUDynamicFBMemory supports ExternalCudaMemoryResource and
655 // ExternalCudaArrayResource
657 size_t &inst_offset);
659
660 // for re-registration purposes, generate an ExternalInstanceResource *
661 // (if possible) for a given instance, or a subset of one
664 span<const FieldID> fields, bool read_only);
665
666 public:
669 size_t cur_size;
670 std::map<RegionInstance, std::pair<CUdeviceptr, size_t>> alloc_bases;
672 };
673
675 public:
676 GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base,
677 void *_cpu_base, size_t _size, MemoryKind _kind,
678 Memory::Kind _lowlevel_kind);
679
680 virtual ~GPUZCMemory(void);
681
682 virtual void get_bytes(off_t offset, void *dst, size_t size);
683
684 virtual void put_bytes(off_t offset, const void *src, size_t size);
685
686 virtual void *get_direct_ptr(off_t offset, size_t size);
687
688 // GPUZCMemory supports ExternalCudaPinnedHostResource
690 size_t &inst_offset);
692
693 // for re-registration purposes, generate an ExternalInstanceResource *
694 // (if possible) for a given instance, or a subset of one
697 span<const FieldID> fields, bool read_only);
698
699 public:
700 CUdeviceptr gpu_base;
701 char *cpu_base;
703 };
704
705 class GPUFBIBMemory : public IBMemory {
706 public:
707 GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base,
708 size_t _size);
709
710 public:
712 CUdeviceptr base;
714 };
715
716 class GPURequest;
717
719 public:
721
723 };
724
725 class GPURequest : public Request {
726 public:
727 const void *src_base;
728 void *dst_base;
729 // off_t src_gpu_off, dst_gpu_off;
732 };
733
735 public:
737 XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size,
738 int _write_port_idx, size_t _write_offset, size_t _write_size,
739 int _read_ind_port_idx = -1, size_t _read_ind_offset = 0,
740 size_t _read_ind_size = 0, int _write_ind_port_idx = -1,
741 size_t _write_ind_offset = 0, size_t _write_ind_size = 0);
742
743 virtual void request_completed(void);
744
745 protected:
755 };
756
758 public:
759 GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset,
760 size_t _read_size, int _write_port_idx, size_t _write_offset,
761 size_t _write_size);
762
763 virtual void request_completed(void);
764
765 protected:
771 };
772
774 public:
775 MemSpecificCudaArray(CUarray _array);
777
778 CUarray array;
779 };
780
782 public:
783 virtual int set_rect(const RegionInstanceImpl *inst,
784 const InstanceLayoutPieceBase *piece, size_t field_size,
785 size_t field_offset, int ndims, const int64_t lo[/*ndims*/],
786 const int64_t hi[/*ndims*/], const int order[/*ndims*/]);
787
788 CUarray array;
789 int dim;
790 size_t pos[3];
792 };
793
794 class GPUChannel;
795
796 class GPUXferDes : public XferDes {
797 public:
798 GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
799 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
800 const std::vector<XferDesPortInfo> &outputs_info, int _priority);
801
802 long get_requests(Request **requests, long nr);
803
805
806 static size_t read_address_entry(AffineCopyInfo<3> &copy_infos, size_t &min_align,
807 MemcpyTransposeInfo<size_t> &transpose_info,
808 AddressListCursor &in_alc, uintptr_t in_base,
809 AddressListCursor &out_alc, uintptr_t out_base,
810 size_t bytes_left, size_t max_xfer_fields,
811 size_t &fields_total);
812
813 private:
814 std::vector<GPU *> src_gpus, dst_gpus;
815 std::vector<bool> dst_is_ipc;
816
      // Minimum amount to transfer in a single quantum before returning in order to
      // ensure forward progress
      // TODO: make controllable
820 static constexpr size_t min_xfer_size = 4 << 20;
      // Maximum amount to transfer in a single quantum in order to ensure other requests
      // have a chance to make forward progress. This should be large enough that the
      // overhead of splitting the copy shouldn't be noticeable in terms of latency (4GiB
      // should be good here for most purposes)
      // TODO: make controllable
826 static constexpr size_t max_xfer_size = 4ULL * 1024ULL * 1024ULL * 1024ULL;
827 static constexpr size_t max_xfer_fields = 2000;
828 };
829
830 class GPUIndirectChannel;
831
833 public:
834 GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
835 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
836 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
837 XferDesRedopInfo _redop_info);
838
839 long get_requests(Request **requests, long nr);
841
842 protected:
843 std::vector<GPU *> src_gpus, dst_gpus;
844 std::vector<bool> dst_is_ipc;
845 };
846
848 : public SingleXDQChannel<GPUIndirectChannel, GPUIndirectXferDes> {
849 public:
852
853 // multi-threading of cuda copies for a given device is disabled by
854 // default (can be re-enabled with -cuda:mtdma 1)
855 static const bool is_ordered = true;
856
857 virtual bool needs_wrapping_iterator() const;
859
861
862 virtual uint64_t
863 supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id,
864 CustomSerdezID dst_serdez_id, ReductionOpID redop_id,
865 size_t total_bytes, const std::vector<size_t> *src_frags,
866 const std::vector<size_t> *dst_frags, XferDesKind *kind_ret = 0,
867 unsigned *bw_ret = 0, unsigned *lat_ret = 0);
868
869 virtual bool supports_indirection_memory(Memory mem) const;
870
871 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
872 XferDesID guid,
873 const std::vector<XferDesPortInfo> &inputs_info,
874 const std::vector<XferDesPortInfo> &outputs_info,
875 int priority, XferDesRedopInfo redop_info,
876 const void *fill_data, size_t fill_size,
877 size_t fill_total);
878
879 long submit(Request **requests, long nr);
880 GPU *get_gpu() const { return src_gpu; }
881
882 protected:
883 friend class GPUIndirectXferDes;
885 };
886
888 public:
890 uintptr_t _remote_ptr,
891 const std::vector<Channel::SupportedPath> &_paths,
892 const std::vector<Memory> &_indirect_memories);
893
895
896 template <typename S>
897 bool serialize(S &serializer) const;
898
899 template <typename S>
900 static RemoteChannelInfo *deserialize_new(S &deserializer);
901
902 protected:
906 };
907
910
911 public:
912 GPUIndirectRemoteChannel(uintptr_t _remote_ptr,
913 const std::vector<Memory> &_indirect_memories);
915 virtual bool needs_wrapping_iterator() const;
916 virtual uint64_t
917 supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id,
918 CustomSerdezID dst_serdez_id, ReductionOpID redop_id,
919 size_t total_bytes, const std::vector<size_t> *src_frags,
920 const std::vector<size_t> *dst_frags, XferDesKind *kind_ret /*= 0*/,
921 unsigned *bw_ret /*= 0*/, unsigned *lat_ret /*= 0*/);
922 };
923
924 class GPUChannel : public SingleXDQChannel<GPUChannel, GPUXferDes> {
925 public:
928
929 // multi-threading of cuda copies for a given device is disabled by
930 // default (can be re-enabled with -cuda:mtdma 1)
931 static const bool is_ordered = true;
932
933 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
934 XferDesID guid,
935 const std::vector<XferDesPortInfo> &inputs_info,
936 const std::vector<XferDesPortInfo> &outputs_info,
937 int priority, XferDesRedopInfo redop_info,
938 const void *fill_data, size_t fill_size,
939 size_t fill_total);
940
941 long submit(Request **requests, long nr);
942 GPU *get_gpu() const { return src_gpu; }
943
945
946 virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
947 {
948 return true;
949 }
950
951 private:
952 GPU *src_gpu;
953 };
954
956 public:
957 GPURemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,
958 const std::vector<Channel::SupportedPath> &_paths);
959
961
962 template <typename S>
963 bool serialize(S &serializer) const;
964
965 template <typename S>
966 static RemoteChannelInfo *deserialize_new(S &deserializer);
967
968 protected:
972 };
973
976
977 GPURemoteChannel(uintptr_t _remote_ptr);
978
979 public:
980 virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
981 {
982 return true;
983 }
984 };
985
986 class GPUfillChannel;
987
988 class GPUfillXferDes : public XferDes {
989 public:
990 GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
991 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
992 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
993 const void *_fill_data, size_t _fill_size, size_t _fill_total);
994
995 long get_requests(Request **requests, long nr);
996
998
999 protected:
1001 };
1002
1003 class GPUfillChannel : public SingleXDQChannel<GPUfillChannel, GPUfillXferDes> {
1004 public:
1006
1007 // multiple concurrent cuda fills ok
1008 static const bool is_ordered = false;
1009
1010 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
1011 XferDesID guid,
1012 const std::vector<XferDesPortInfo> &inputs_info,
1013 const std::vector<XferDesPortInfo> &outputs_info,
1014 int priority, XferDesRedopInfo redop_info,
1015 const void *fill_data, size_t fill_size,
1016 size_t fill_total);
1017
1018 long submit(Request **requests, long nr);
1019
1020 protected:
1021 friend class GPUfillXferDes;
1022
1024 };
1025
1026 class GPUreduceChannel;
1027
1032
1033 class GPUreduceXferDes : public XferDes {
1034 public:
1035 GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
1036 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
1037 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
1038 XferDesRedopInfo _redop_info);
1039
1040 long get_requests(Request **requests, long nr);
1041
1044 XferPort *in_port, XferPort *out_port,
1045 const size_t in_span_start,
1046 const size_t out_span_start);
1047
1049 const size_t in_span_start, const size_t out_span_start,
1050 const size_t in_elem_size, const size_t out_elem_size,
1051 const size_t elems, const bool has_transpose);
1053
1055 bool resolve_kernel_slot(GPU *gpu, void *host_proxy, CUfunction &kernel_out,
1056 CUfunction GPU::GPUReductionOpEntry::*cache_field);
1057
1058 protected:
1061 CUfunction kernel;
1068 std::vector<GPU *> src_gpus;
1069 std::vector<bool> src_is_ipc;
1070 };
1071
1072 class GPUreduceChannel : public SingleXDQChannel<GPUreduceChannel, GPUreduceXferDes> {
1073 public:
1075
1076 // multiple concurrent cuda reduces ok
1077 static const bool is_ordered = false;
1078
1079 // helper method here so that GPUreduceRemoteChannel can use it too
1080 bool supports_redop(ReductionOpID redop_id) const override;
1081
1083
1084 XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid,
1085 const std::vector<XferDesPortInfo> &inputs_info,
1086 const std::vector<XferDesPortInfo> &outputs_info,
1087 int priority, XferDesRedopInfo redop_info,
1088 const void *fill_data, size_t fill_size,
1089 size_t fill_total) override;
1090
1091 long submit(Request **requests, long nr) override;
1092
1093 protected:
1094 friend class GPUreduceXferDes;
1095
1097 };
1098
1100 public:
1101 GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,
1102 const std::vector<Channel::SupportedPath> &_paths);
1103
1105
1106 template <typename S>
1107 bool serialize(S &serializer) const;
1108
1109 template <typename S>
1110 static RemoteChannelInfo *deserialize_new(S &deserializer);
1111
1112 protected:
1116 };
1117
1120
1121 GPUreduceRemoteChannel(uintptr_t _remote_ptr);
1122 };
1123
1124 // active message for establishing cuda ipc mappings
1126 unsigned count = 0;
1127#if !defined(REALM_IS_WINDOWS)
1128 long hostid = 0;
1129#endif
1130 static void handle_message(NodeID sender, const CudaIpcImportRequest &args,
1131 const void *data, size_t datalen);
1132 };
1133
1135 public:
1137
1138 virtual void chunk_created(void *base, size_t bytes);
1139 virtual void chunk_destroyed(void *base, size_t bytes);
1140
1141 protected:
1142 CudaModule *module;
1143 };
1144
1149 public:
1150 // -- Constructors --
1151 GPUAllocation(void) = default;
1152 GPUAllocation(GPUAllocation &&other) noexcept;
1153 GPUAllocation(const GPUAllocation &) = delete;
1155 GPUAllocation &operator=(const GPUAllocation &) = delete;
1157
1158 // --- Accessors ---
1159 inline operator bool(void) const { return dev_ptr != 0; }
1166
1171 inline bool get_ipc_handle(CUipcMemHandle &handle) const
1172 {
1173 if(has_ipc_handle) {
1174 handle = ipc_handle;
1175 }
1176 return has_ipc_handle;
1177 }
1178#if CUDA_VERSION >= 12030
1183 bool get_fabric_handle(CUmemFabricHandle &handle) const;
1184#endif
1193 inline CUdeviceptr get_dptr(void) const { return dev_ptr; }
1196 inline GPU *get_gpu(void) const { return gpu; }
1199 inline size_t get_size(void) const { return size; }
1200
1206 template <typename T = void>
1207 T *get_hptr(void) const
1208 {
1209 return static_cast<T *>(host_ptr);
1210 }
1211
1218
1219 // -- Allocators --
1220
1230 static GPUAllocation *allocate_dev(GPU *gpu, size_t size, bool peer_enabled = true,
1231 bool shareable = true);
1232#if CUDA_VERSION >= 11000
1246 static GPUAllocation *allocate_mmap(GPU *gpu, const CUmemAllocationProp &prop,
1247 size_t size, CUdeviceptr vaddr = 0,
1248 bool peer_enabled = true);
1249#endif
1261 static GPUAllocation *allocate_host(GPU *gpu, size_t size, bool peer_enabled = true,
1262 bool shareable = true, bool same_va = true);
1269 static GPUAllocation *allocate_managed(GPU *gpu, size_t size);
1281 static GPUAllocation *register_allocation(GPU *gpu, void *ptr, size_t size,
1282 bool peer_enabled = true);
1288 static GPUAllocation *open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl);
1297 static GPUAllocation *open_handle(GPU *gpu, OsHandle hdl, size_t size,
1298 bool peer_enabled = true);
1299#if CUDA_VERSION >= 12030
1310 static GPUAllocation *open_fabric(GPU *gpu, const CUmemFabricHandle &hdl,
1311 size_t size, bool peer_enabled = true,
1312 bool is_local = false);
1313#endif
1314
1315 private:
1316 CUresult map_allocation(GPU *gpu, CUmemGenericAllocationHandle handle, size_t size,
1317 CUdeviceptr va = 0, size_t offset = 0,
1318 bool peer_enabled = false, bool map_host = false);
1319
1320#if CUDA_VERSION >= 11000
1326 static size_t align_size(const CUmemAllocationProp &prop, size_t size);
1327#endif
1328 // -- Deleters --
1329 typedef void (*DeleterCallback)(GPUAllocation &alloc);
1330
1331 // These are helper functions to manage what freeing strategy needs to be used to
1332 // properly free the allocation
1333 static void cuda_malloc_free(GPUAllocation &alloc);
1334 static void cuda_malloc_host_free(GPUAllocation &alloc);
1335 static void cuda_register_free(GPUAllocation &alloc);
1336 static void cuda_ipc_free(GPUAllocation &alloc);
1337#if CUDA_VERSION >= 11000
1338 static void cuda_memmap_free(GPUAllocation &alloc);
1339#endif
1340
1341 // -- Members --
1343 GPU *gpu = nullptr;
1345 CUdeviceptr dev_ptr = 0;
1347 void *host_ptr = nullptr;
1349 size_t size = 0;
1351 DeleterCallback deleter = nullptr;
1352#if CUDA_VERSION >= 11000
1354 CUmemGenericAllocationHandle mmap_handle = 0;
1355 // True if VA needs to be released for cuMemMap'ed memory
1356 // or if the registered memory actually needs to be unregistered
1357 bool owns_va = true;
1358#endif
1360 bool has_ipc_handle = false;
1362 CUipcMemHandle ipc_handle;
1363 };
1364
1365 // Define these APIs locally here if we know the definition isn't in cuda.h. This
1366 // allows us to use this driver function even if it is unavailable to our current
1367 // toolkit
1368
1369#if CUDA_VERSION < 11030
1370#define CU_GET_PROC_ADDRESS_DEFAULT 0
1371 CUresult cuGetProcAddress(const char *, void **, int, int);
1372#endif
1373
1374#if CUDA_VERSION < 12050
1375 CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event);
1376#endif
1377
1378#if CUDA_VERSION >= 13000
// Unfortunately, 13.0 violates its own source-compatibility rules versus
// cuGetProcAddress, so fix that ourselves here.
1381#if !defined(cuCtxGetDevice)
1382#define cuCtxGetDevice cuCtxGetDevice_v2
1383#endif
1384#if !defined(cuCtxSynchronize)
1385#define cuCtxSynchronize cuCtxSynchronize_v2
1386#endif
1387#if !defined(cuStreamGetCtx)
1388#define cuStreamGetCtx cuStreamGetCtx_v2
1389#endif
1390#endif
1391
1392 // cuda driver and/or runtime entry points
1393#define CUDA_DRIVER_HAS_FNPTR(name) ((name##_fnptr) != nullptr)
1394#define CUDA_DRIVER_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
1395
1396// Only APIs that are available in the minimum base driver version that Realm supports
1397// should be listed here
1398
1399// Note: it is imperative for APIs introduced in minor versions after
1400// the minimum version defined above to explicitly denote the version they were
1401// introduced, otherwise it is possible to retrieve the wrong API and crash when called.
1402
1403// The minimum base driver version Realm supports
1404#define CUDA_VERSION_MIN 11080
1405// Source compatible version of cuda.h (the minimum version where the decltype(&fn)
1406// matches the function returned from cuGetProcAddress(fn, CUDA_VERSION_COMPAT) )
1407#define CUDA_VERSION_COMPAT ((CUDA_VERSION / 1000) * 1000)
1408
1409#define CUDA_DRIVER_APIS(__op__) \
1410 __op__(cuModuleGetFunction, CUDA_VERSION_MIN); \
1411 __op__(cuCtxGetDevice, CUDA_VERSION_MIN); \
1412 __op__(cuCtxEnablePeerAccess, CUDA_VERSION_MIN); \
1413 __op__(cuCtxGetFlags, CUDA_VERSION_MIN); \
1414 __op__(cuCtxGetStreamPriorityRange, CUDA_VERSION_MIN); \
1415 __op__(cuCtxPopCurrent, CUDA_VERSION_MIN); \
1416 __op__(cuCtxPushCurrent, CUDA_VERSION_MIN); \
1417 __op__(cuCtxSynchronize, CUDA_VERSION_MIN); \
1418 __op__(cuDeviceCanAccessPeer, CUDA_VERSION_MIN); \
1419 __op__(cuDeviceGet, CUDA_VERSION_MIN); \
1420 __op__(cuDeviceGetUuid, CUDA_VERSION_MIN); \
1421 __op__(cuDeviceGetAttribute, CUDA_VERSION_MIN); \
1422 __op__(cuDeviceGetCount, CUDA_VERSION_MIN); \
1423 __op__(cuDeviceGetName, CUDA_VERSION_MIN); \
1424 __op__(cuDevicePrimaryCtxRelease, CUDA_VERSION_MIN); \
1425 __op__(cuDevicePrimaryCtxRetain, CUDA_VERSION_MIN); \
1426 __op__(cuDevicePrimaryCtxSetFlags, CUDA_VERSION_MIN); \
1427 __op__(cuDeviceTotalMem, CUDA_VERSION_MIN); \
1428 __op__(cuEventCreate, CUDA_VERSION_MIN); \
1429 __op__(cuEventDestroy, CUDA_VERSION_MIN); \
1430 __op__(cuEventQuery, CUDA_VERSION_MIN); \
1431 __op__(cuEventRecord, CUDA_VERSION_MIN); \
1432 __op__(cuGetErrorName, CUDA_VERSION_MIN); \
1433 __op__(cuGetErrorString, CUDA_VERSION_MIN); \
1434 __op__(cuInit, CUDA_VERSION_MIN); \
1435 __op__(cuIpcCloseMemHandle, CUDA_VERSION_MIN); \
1436 __op__(cuIpcGetMemHandle, CUDA_VERSION_MIN); \
1437 __op__(cuIpcOpenMemHandle, CUDA_VERSION_MIN); \
1438 __op__(cuLaunchKernel, CUDA_VERSION_MIN); \
1439 __op__(cuMemAllocManaged, CUDA_VERSION_MIN); \
1440 __op__(cuMemAlloc, CUDA_VERSION_MIN); \
1441 __op__(cuMemcpy2DAsync, CUDA_VERSION_MIN); \
1442 __op__(cuMemcpy3DAsync, CUDA_VERSION_MIN); \
1443 __op__(cuMemcpyAsync, CUDA_VERSION_MIN); \
1444 __op__(cuMemcpyDtoDAsync, CUDA_VERSION_MIN); \
1445 __op__(cuMemcpyDtoH, CUDA_VERSION_MIN); \
1446 __op__(cuMemcpyDtoHAsync, CUDA_VERSION_MIN); \
1447 __op__(cuMemcpyHtoD, CUDA_VERSION_MIN); \
1448 __op__(cuMemcpyHtoDAsync, CUDA_VERSION_MIN); \
1449 __op__(cuMemFreeHost, CUDA_VERSION_MIN); \
1450 __op__(cuMemFree, CUDA_VERSION_MIN); \
1451 __op__(cuMemGetInfo, CUDA_VERSION_MIN); \
1452 __op__(cuMemHostAlloc, CUDA_VERSION_MIN); \
1453 __op__(cuMemHostGetDevicePointer, CUDA_VERSION_MIN); \
1454 __op__(cuMemHostRegister, CUDA_VERSION_MIN); \
1455 __op__(cuMemHostUnregister, CUDA_VERSION_MIN); \
1456 __op__(cuMemsetD16Async, CUDA_VERSION_MIN); \
1457 __op__(cuMemsetD2D16Async, CUDA_VERSION_MIN); \
1458 __op__(cuMemsetD2D32Async, CUDA_VERSION_MIN); \
1459 __op__(cuMemsetD2D8Async, CUDA_VERSION_MIN); \
1460 __op__(cuMemsetD32Async, CUDA_VERSION_MIN); \
1461 __op__(cuMemsetD8Async, CUDA_VERSION_MIN); \
1462 __op__(cuModuleLoadDataEx, CUDA_VERSION_MIN); \
1463 __op__(cuStreamAddCallback, CUDA_VERSION_MIN); \
1464 __op__(cuStreamCreate, CUDA_VERSION_MIN); \
1465 __op__(cuStreamCreateWithPriority, CUDA_VERSION_MIN); \
1466 __op__(cuStreamDestroy, CUDA_VERSION_MIN); \
1467 __op__(cuStreamSynchronize, CUDA_VERSION_MIN); \
1468 __op__(cuOccupancyMaxPotentialBlockSize, CUDA_VERSION_MIN); \
1469 __op__(cuOccupancyMaxPotentialBlockSizeWithFlags, CUDA_VERSION_MIN); \
1470 __op__(cuEventSynchronize, CUDA_VERSION_MIN); \
1471 __op__(cuEventElapsedTime, CUDA_VERSION_MIN); \
1472 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessor, CUDA_VERSION_MIN); \
1473 __op__(cuMemAddressReserve, CUDA_VERSION_MIN); \
1474 __op__(cuMemAddressFree, CUDA_VERSION_MIN); \
1475 __op__(cuMemCreate, CUDA_VERSION_MIN); \
1476 __op__(cuMemRelease, CUDA_VERSION_MIN); \
1477 __op__(cuMemMap, CUDA_VERSION_MIN); \
1478 __op__(cuMemUnmap, CUDA_VERSION_MIN); \
1479 __op__(cuMemSetAccess, CUDA_VERSION_MIN); \
1480 __op__(cuMemGetAllocationGranularity, CUDA_VERSION_MIN); \
1481 __op__(cuMemGetAllocationPropertiesFromHandle, CUDA_VERSION_MIN); \
1482 __op__(cuMemExportToShareableHandle, CUDA_VERSION_MIN); \
1483 __op__(cuMemImportFromShareableHandle, CUDA_VERSION_MIN); \
1484 __op__(cuStreamWaitEvent, CUDA_VERSION_MIN); \
1485 __op__(cuStreamQuery, CUDA_VERSION_MIN); \
1486 __op__(cuMemGetAddressRange, CUDA_VERSION_MIN); \
1487 __op__(cuPointerGetAttributes, CUDA_VERSION_MIN); \
1488 __op__(cuDriverGetVersion, CUDA_VERSION_MIN); \
1489 __op__(cuMemAdvise, CUDA_VERSION_MIN); \
1490 __op__(cuMemPrefetchAsync, CUDA_VERSION_MIN); \
1491 __op__(cuCtxSetSharedMemConfig, CUDA_VERSION_MIN); \
1492 __op__(cuCtxSetCacheConfig, CUDA_VERSION_MIN); \
1493 __op__(cuCtxSetLimit, CUDA_VERSION_MIN); \
1494 __op__(cuCtxGetLimit, CUDA_VERSION_MIN); \
1495 __op__(cuFuncSetAttribute, CUDA_VERSION_MIN); \
1496 __op__(cuFuncSetCacheConfig, CUDA_VERSION_MIN); \
1497 __op__(cuFuncSetSharedMemConfig, CUDA_VERSION_MIN); \
1498 __op__(cuFuncGetAttribute, CUDA_VERSION_MIN); \
1499 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, CUDA_VERSION_MIN); \
1500 __op__(cuArray3DCreate, CUDA_VERSION_MIN); \
1501 __op__(cuArrayDestroy, CUDA_VERSION_MIN); \
1502 __op__(cuSurfObjectCreate, CUDA_VERSION_MIN); \
1503 __op__(cuSurfObjectDestroy, CUDA_VERSION_MIN); \
1504 __op__(cuLaunchCooperativeKernel, CUDA_VERSION_MIN); \
1505 __op__(cuModuleGetGlobal, CUDA_VERSION_MIN); \
1506 __op__(cuLaunchHostFunc, CUDA_VERSION_MIN); \
1507 __op__(cuCtxRecordEvent, 12050); \
1508 __op__(cuArrayGetMemoryRequirements, CUDA_VERSION_MIN);
1509
1510// Make sure to only use decltype, to ensure it matches the cuda.h definition
1511#define DECL_FNPTR_EXTERN(name, ver) extern decltype(&name) name##_fnptr;
1513#undef DECL_FNPTR_EXTERN
1514
1515#define NVML_FNPTR(name) (name##_fnptr)
1516
1517#if NVML_API_VERSION >= 11
1518#define NVML_11_APIS(__op__) __op__(nvmlDeviceGetMemoryAffinity);
1519#else
1520#define NVML_11_APIS(__op__)
1521#endif
1522
1523#if NVML_API_VERSION >= 12
1524#define NVML_12_APIS(__op__) __op__(nvmlDeviceGetGpuFabricInfo)
1525#else
1526#define NVML_12_APIS(__op__)
1527#endif
1528
1529#if CUDA_VERSION < 11040
1530 // Define an NVML api that doesn't exist prior to CUDA Toolkit 11.5, but should
1531 // exist in systems that require it that we need to support (we'll detect its
1532 // availability later)
1533 //
1534 // Although these are NVML apis, NVML_API_VERSION doesn't support any way to detect
1535 // minor versioning, so we'll use the cuda header's versioning here, which should
1536 // coincide with the versions we're looking for
1544
1545 nvmlReturn_t
1546 nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link,
1547 nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType);
1548#endif
1549
1550#define NVML_APIS(__op__) \
1551 __op__(nvmlInit); \
1552 __op__(nvmlDeviceGetHandleByUUID); \
1553 __op__(nvmlDeviceGetMaxPcieLinkWidth); \
1554 __op__(nvmlDeviceGetMaxPcieLinkGeneration); \
1555 __op__(nvmlDeviceGetNvLinkState); \
1556 __op__(nvmlDeviceGetNvLinkVersion); \
1557 __op__(nvmlDeviceGetNvLinkRemotePciInfo); \
1558 __op__(nvmlDeviceGetNvLinkRemoteDeviceType); \
1559 __op__(nvmlDeviceGetDeviceHandleFromMigDeviceHandle); \
1560 __op__(nvmlDeviceGetFieldValues); \
1561 NVML_11_APIS(__op__); \
1562 NVML_12_APIS(__op__);
1563
1564#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1566#undef DECL_FNPTR_EXTERN
1567
1568#define CUPTI_APIS(__op__) \
1569 __op__(cuptiActivityRegisterCallbacks); \
1570 __op__(cuptiActivityEnable); \
1571 __op__(cuptiActivityDisable); \
1572 __op__(cuptiActivityEnableContext); \
1573 __op__(cuptiActivityDisableContext); \
1574 __op__(cuptiActivityFlushAll); \
1575 __op__(cuptiActivityGetNextRecord); \
1576 __op__(cuptiActivityRegisterTimestampCallback); \
1577 __op__(cuptiActivityPushExternalCorrelationId); \
1578 __op__(cuptiActivityPopExternalCorrelationId);
1579
1580#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1582#undef DECL_FNPTR_EXTERN
1583
1584#define CUPTI_HAS_FNPTR(name) (name##_fnptr != nullptr)
1585#define CUPTI_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
1586
1587 }; // namespace Cuda
1588
1589}; // namespace Realm
1590
1591#endif
bootstrap_handle_t * handle
Definition bootstrap.h:61
Definition address_list.h:100
Definition bgwork.h:129
Definition bgwork.h:36
Definition bytearray.h:30
Definition bytearray.h:53
Definition channel.h:713
Definition circ_queue.h:35
Definition codedesc.h:249
Definition threads.h:382
Definition threads.h:342
Definition cuda_internal.h:781
int dim
Definition cuda_internal.h:789
CUarray array
Definition cuda_internal.h:788
size_t width_in_bytes
Definition cuda_internal.h:791
size_t height
Definition cuda_internal.h:791
size_t pos[3]
Definition cuda_internal.h:790
size_t depth
Definition cuda_internal.h:791
virtual int set_rect(const RegionInstanceImpl *inst, const InstanceLayoutPieceBase *piece, size_t field_size, size_t field_offset, int ndims, const int64_t lo[], const int64_t hi[], const int order[])
Definition cuda_internal.h:543
GPU * gpu
Definition cuda_internal.h:550
Definition cuda_internal.h:360
Mutex mutex
Definition cuda_internal.h:376
std::vector< Thread * > worker_threads
Definition cuda_internal.h:381
int total_threads
Definition cuda_internal.h:380
CoreReservation * core_rsrv
Definition cuda_internal.h:382
int max_threads
Definition cuda_internal.h:375
void add_fence(GPUWorkFence *fence)
ContextSynchronizer(GPU *_gpu, CUcontext _context, CoreReservationSet &crs, int _max_threads)
GPU * gpu
Definition cuda_internal.h:373
int syncing_threads
Definition cuda_internal.h:380
Mutex::CondVar condvar
Definition cuda_internal.h:377
CUcontext context
Definition cuda_internal.h:374
int sleeping_threads
Definition cuda_internal.h:380
bool shutdown_flag
Definition cuda_internal.h:378
GPUWorkFence::FenceList fences
Definition cuda_internal.h:379
Definition cuda_internal.h:592
CudaDeviceMemoryInfo(CUcontext _context)
GPU * gpu
Definition cuda_internal.h:597
CUcontext context
Definition cuda_internal.h:596
Definition cuda_module.h:181
Class for managing the lifetime of a given gpu allocation. As instances of this class own an underlyi...
Definition cuda_internal.h:1148
static GPUAllocation * open_handle(GPU *gpu, OsHandle hdl, size_t size, bool peer_enabled=true)
Retrieves the GPUAllocation given the OsHandle.
static GPUAllocation * register_allocation(GPU *gpu, void *ptr, size_t size, bool peer_enabled=true)
Create an allocation that registers the given CPU address range with CUDA, making it accessible from ...
static GPUAllocation * allocate_host(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true, bool same_va=true)
Allocate CPU-located memory for the given gpu with the given size and features.
T * get_hptr(void) const
Retrieves the CPU accessible base address for the allocation, or nullptr if there is no way to access...
Definition cuda_internal.h:1207
static GPUAllocation * allocate_managed(GPU *gpu, size_t size)
Allocate migratable memory that can be used with CUDA's managed memory APIs (cuMemPrefetchAsync,...
GPUAllocation(const GPUAllocation &)=delete
size_t get_size(void) const
Retrieves the given size of the allocation.
Definition cuda_internal.h:1199
OsHandle get_os_handle(void) const
Accessor for the file descriptor or win32 HANDLE associated with the allocation. This handle can be s...
bool get_ipc_handle(CUipcMemHandle &handle) const
Retrieves the CUipcMemHandle for this allocation that can be used with GPUAllocation::open_ipc.
Definition cuda_internal.h:1171
static GPUAllocation * allocate_dev(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true)
Allocates device-located memory for the given gpu with the given size and features.
GPUAllocation & operator=(GPUAllocation &&) noexcept
static void * get_win32_shared_attributes(void)
Retrieves the default win32 shared attributes for creating a shared object that can be set in CUmemAl...
static GPUAllocation * open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl)
Retrieves the GPUAllocation given the CUipcMemHandle.
GPUAllocation(GPUAllocation &&other) noexcept
CUdeviceptr get_dptr(void) const
Retrieves the base CUdeviceptr for the associated allocation that can be used to access the underlyin...
Definition cuda_internal.h:1193
GPU * get_gpu(void) const
Retrieves the owning GPU.
Definition cuda_internal.h:1196
Definition cuda_internal.h:924
virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
Definition cuda_internal.h:946
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition cuda_internal.h:931
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
GPU * get_gpu() const
Definition cuda_internal.h:942
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
Definition cuda_internal.h:718
GPURequest * req
Definition cuda_internal.h:722
Definition cuda_internal.h:196
virtual ~GPUCompletionNotification(void)
Definition cuda_internal.h:198
virtual void request_completed(void)=0
Definition cuda_internal.h:183
void destroy_context(InternalTask *task, void *context) const override
void destroy_context(Task *task, void *context) const override
GPU * gpu
Definition cuda_internal.h:190
void * create_context(Task *task) const override
GPUContextManager(GPU *_gpu, GPUProcessor *proc)
void * create_context(InternalTask *task) const override
GPUProcessor * proc
Definition cuda_internal.h:191
Definition cuda_internal.h:631
GPU * gpu
Definition cuda_internal.h:667
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)
size_t cur_size
Definition cuda_internal.h:669
NetworkSegment local_segment
Definition cuda_internal.h:671
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)
Mutex mutex
Definition cuda_internal.h:668
std::map< RegionInstance, std::pair< CUdeviceptr, size_t > > alloc_bases
Definition cuda_internal.h:670
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
Definition cuda_internal.h:337
CUevent get_event(bool external=false)
Mutex mutex
Definition cuda_internal.h:351
void init_pool(int init_size=0)
int batch_size
Definition cuda_internal.h:352
void return_event(CUevent e, bool external=false)
std::vector< CUevent > available_events
Definition cuda_internal.h:353
GPUEventPool(int _batch_size=256)
int total_size
Definition cuda_internal.h:352
int current_size
Definition cuda_internal.h:352
int external_count
Definition cuda_internal.h:352
Definition cuda_internal.h:705
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
NetworkSegment local_segment
Definition cuda_internal.h:713
CUdeviceptr base
Definition cuda_internal.h:712
GPU * gpu
Definition cuda_internal.h:711
Definition cuda_internal.h:600
NetworkSegment local_segment
Definition cuda_internal.h:628
GPU * gpu
Definition cuda_internal.h:626
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void * get_direct_ptr(off_t offset, size_t size)
CUdeviceptr base
Definition cuda_internal.h:627
virtual void put_bytes(off_t offset, const void *src, size_t size)
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual ~GPUFBMemory(void)
Definition cuda_internal.h:848
GPU * get_gpu() const
Definition cuda_internal.h:880
static const bool is_ordered
Definition cuda_internal.h:855
GPU * src_gpu
Definition cuda_internal.h:884
virtual bool supports_indirection_memory(Memory mem) const
Queries if a given mem can be used as an indirection buffer.
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
virtual Memory suggest_ib_memories() const
GPUIndirectChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
virtual bool needs_wrapping_iterator() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret=0, unsigned *bw_ret=0, unsigned *lat_ret=0)
Definition cuda_internal.h:887
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUIndirectRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:905
bool serialize(S &serializer) const
GPUIndirectRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths, const std::vector< Memory > &_indirect_memories)
virtual RemoteChannel * create_remote_channel()
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:908
virtual bool needs_wrapping_iterator() const
virtual Memory suggest_ib_memories() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret, unsigned *bw_ret, unsigned *lat_ret)
GPUIndirectRemoteChannel(uintptr_t _remote_ptr, const std::vector< Memory > &_indirect_memories)
Definition cuda_internal.h:734
size_t read_ind_offset
Definition cuda_internal.h:750
size_t write_size
Definition cuda_internal.h:752
size_t write_offset
Definition cuda_internal.h:752
int write_ind_port_idx
Definition cuda_internal.h:753
size_t read_offset
Definition cuda_internal.h:748
size_t write_ind_offset
Definition cuda_internal.h:754
size_t read_ind_size
Definition cuda_internal.h:750
GPUIndirectTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size, int _read_ind_port_idx=-1, size_t _read_ind_offset=0, size_t _read_ind_size=0, int _write_ind_port_idx=-1, size_t _write_ind_offset=0, size_t _write_ind_size=0)
int write_port_idx
Definition cuda_internal.h:751
size_t write_ind_size
Definition cuda_internal.h:754
int read_ind_port_idx
Definition cuda_internal.h:749
XferDes * xd
Definition cuda_internal.h:746
size_t read_size
Definition cuda_internal.h:748
int read_port_idx
Definition cuda_internal.h:747
Definition cuda_internal.h:832
bool progress_xd(GPUIndirectChannel *channel, TimeLimit work_until)
std::vector< bool > dst_is_ipc
Definition cuda_internal.h:844
std::vector< GPU * > dst_gpus
Definition cuda_internal.h:843
long get_requests(Request **requests, long nr)
std::vector< GPU * > src_gpus
Definition cuda_internal.h:843
GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
Definition cuda_internal.h:554
virtual void shutdown(void)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:574
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)
virtual ~GPUProcessor(void)
GPU * gpu
Definition cuda_internal.h:571
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition cuda_internal.h:584
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)
Definition cuda_internal.h:955
virtual RemoteChannel * create_remote_channel()
GPURemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPURemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:971
static RemoteChannelInfo * deserialize_new(S &deserializer)
bool serialize(S &serializer) const
Definition cuda_internal.h:974
virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
Definition cuda_internal.h:980
Definition cuda_internal.h:1134
GPUReplHeapListener(CudaModule *_module)
virtual void chunk_created(void *base, size_t bytes)
virtual void chunk_destroyed(void *base, size_t bytes)
Definition cuda_internal.h:725
GPUCompletionEvent event
Definition cuda_internal.h:731
void * dst_base
Definition cuda_internal.h:728
const void * src_base
Definition cuda_internal.h:727
GPU * dst_gpu
Definition cuda_internal.h:730
Definition cuda_internal.h:247
bool ok_to_submit_copy(size_t bytes, XferDes *xd)
REALM_INTERNAL_API_EXTERNAL_LINKAGE CUstream get_stream(void) const
void add_notification(GPUCompletionNotification *notification)
void add_event(CUevent event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)
Mutex mutex
Definition cuda_internal.h:281
void add_start_event(GPUWorkStart *start)
bool has_work(void) const
GPU * gpu
Definition cuda_internal.h:276
GPU * get_gpu(void) const
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)
GPUWorker * worker
Definition cuda_internal.h:277
bool reap_events(TimeLimit work_until)
void add_fence(GPUWorkFence *fence)
std::deque< PendingEvent > pending_events
Definition cuda_internal.h:291
void wait_on_streams(const std::set< GPUStream * > &other_streams)
CUstream stream
Definition cuda_internal.h:279
Definition cuda_internal.h:757
size_t read_offset
Definition cuda_internal.h:768
int write_port_idx
Definition cuda_internal.h:769
XferDes * xd
Definition cuda_internal.h:766
size_t write_size
Definition cuda_internal.h:770
size_t write_offset
Definition cuda_internal.h:770
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)
virtual void request_completed(void)
size_t read_size
Definition cuda_internal.h:768
int read_port_idx
Definition cuda_internal.h:767
Definition cuda_internal.h:203
virtual void request_cancellation(void)
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition cuda_internal.h:216
GPU * gpu
Definition cuda_internal.h:224
void enqueue_on_stream(GPUStream *stream)
virtual void print(std::ostream &os) const
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition cuda_internal.h:220
GPUWorkFence(GPU *gpu, Realm::Operation *op)
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)
static void cuda_callback(CUstream stream, CUresult res, void *data)
virtual void mark_finished(bool successful)
Definition cuda_internal.h:227
void enqueue_on_stream(GPUStream *stream)
GPUWorkStart(Realm::Operation *op)
virtual void request_cancellation(void)
Definition cuda_internal.h:231
virtual void print(std::ostream &os) const
static void cuda_start_callback(CUstream stream, CUresult res, void *data)
Definition cuda_internal.h:298
virtual ~GPUWorker(void)
bool process_streams(bool sleep_on_empty)
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition cuda_internal.h:325
void shutdown_background_thread(void)
ActiveStreamQueue active_streams
Definition cuda_internal.h:326
void add_stream(GPUStream *s)
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:329
Mutex::CondVar condvar
Definition cuda_internal.h:323
Mutex lock
Definition cuda_internal.h:322
atomic< bool > worker_shutdown_requested
Definition cuda_internal.h:332
bool do_work(TimeLimit work_until)
bool thread_sleeping
Definition cuda_internal.h:331
Realm::Thread * worker_thread
Definition cuda_internal.h:330
Definition cuda_internal.h:796
bool progress_xd(GPUChannel *channel, TimeLimit work_until)
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)
static size_t read_address_entry(AffineCopyInfo< 3 > &copy_infos, size_t &min_align, MemcpyTransposeInfo< size_t > &transpose_info, AddressListCursor &in_alc, uintptr_t in_base, AddressListCursor &out_alc, uintptr_t out_base, size_t bytes_left, size_t max_xfer_fields, size_t &fields_total)
long get_requests(Request **requests, long nr)
Definition cuda_internal.h:674
NetworkSegment local_segment
Definition cuda_internal.h:702
virtual ~GPUZCMemory(void)
char * cpu_base
Definition cuda_internal.h:701
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void put_bytes(off_t offset, const void *src, size_t size)
CUdeviceptr gpu_base
Definition cuda_internal.h:700
virtual void get_bytes(off_t offset, void *dst, size_t size)
Definition cuda_internal.h:392
void pop_context(void)
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)
void launch_transpose_kernel(MemcpyTransposeInfo< size_t > &copy_info, size_t elemSize, GPUStream *stream)
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)
CUdeviceptr fbmem_base
Definition cuda_internal.h:482
GPUFuncInfo fill_affine_large_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:479
std::vector< CudaIpcMapping > cudaipc_mappings
Definition cuda_internal.h:519
void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize, size_t volume, bool multified_optimized, GPUStream *stream)
bool can_access_peer(const GPU *peer) const
GPUFBMemory * fbmem
Definition cuda_internal.h:449
GPUFuncInfo transpose_kernels[CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:480
std::unordered_map< ReductionOpID, GPUReductionOpEntry > gpu_reduction_table
Definition cuda_internal.h:539
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const
GPUStream * device_to_host_stream
Definition cuda_internal.h:497
GPUProcessor * proc
Definition cuda_internal.h:446
GPUFuncInfo batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:473
std::vector< GPUStream * > task_streams
Definition cuda_internal.h:501
ContextSynchronizer ctxsync
Definition cuda_internal.h:442
CUmodule device_module
Definition cuda_internal.h:455
void create_processor(RuntimeImpl *runtime, size_t stack_size)
Mutex alloc_mutex
Definition cuda_internal.h:521
std::set< Memory > managed_mems
Definition cuda_internal.h:490
GPUStream * host_to_device_stream
Definition cuda_internal.h:496
GPUStream * get_next_d2d_stream()
GPUFuncInfo indirect_copy_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:472
GPUFuncInfo multi_batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:475
GPUStream * device_to_device_stream
Definition cuda_internal.h:498
size_t cupti_activity_refcount
Definition cuda_internal.h:504
std::map< NodeID, GPUStream * > cudaipc_streams
Definition cuda_internal.h:520
std::map< CUdeviceptr, GPUAllocation > allocations
Definition cuda_internal.h:448
void push_context(void)
std::set< Memory > pinned_sysmems
Definition cuda_internal.h:487
GPUFuncInfo batch_fill_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:477
static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES
Definition cuda_internal.h:469
GPUStream * get_next_task_stream(bool create=false)
void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size, size_t field_size, size_t volume, GPUStream *stream)
CUmodule load_cuda_module(const void *data)
int least_stream_priority
Definition cuda_internal.h:510
CUdeviceptr fb_ibmem_base
Definition cuda_internal.h:484
bool register_reduction(ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl, CUfunction fold_excl, CUfunction fold_nonexcl, CUfunction apply_excl_advanced, CUfunction apply_nonexcl_advanced, CUfunction fold_excl_advanced, CUfunction fold_nonexcl_advanced, CUfunction apply_excl_transpose, CUfunction apply_nonexcl_transpose, CUfunction fold_excl_transpose, CUfunction fold_nonexcl_transpose)
GPUAllocation & add_allocation(GPUAllocation &&alloc)
GPUDynamicFBMemory * fb_dmem
Definition cuda_internal.h:450
void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
int greatest_stream_priority
Definition cuda_internal.h:510
const CudaIpcMapping * find_ipc_mapping(Memory mem) const
GPUEventPool event_pool
Definition cuda_internal.h:506
CUcontext context
Definition cuda_internal.h:453
GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context)
std::vector< GPUStream * > device_to_device_streams
Definition cuda_internal.h:499
GPUFBIBMemory * fb_ibmem
Definition cuda_internal.h:451
atomic< unsigned > next_task_stream
Definition cuda_internal.h:502
GPUStream * find_stream(CUstream stream) const
bool is_accessible_host_mem(const MemoryImpl *mem) const
GPUInfo * info
Definition cuda_internal.h:444
std::vector< GPUStream * > peer_to_peer_streams
Definition cuda_internal.h:500
GPUWorker * worker
Definition cuda_internal.h:445
void create_dma_channels(Realm::RuntimeImpl *r)
std::set< Memory > peer_fbs
Definition cuda_internal.h:493
atomic< unsigned > next_d2d_stream
Definition cuda_internal.h:503
bool is_accessible_gpu_mem(const MemoryImpl *mem) const
Definition cuda_internal.h:1003
static const bool is_ordered
Definition cuda_internal.h:1008
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
long submit(Request **requests, long nr)
GPU * gpu
Definition cuda_internal.h:1023
Definition cuda_internal.h:988
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)
size_t reduced_fill_size
Definition cuda_internal.h:1000
long get_requests(Request **requests, long nr)
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)
Definition cuda_internal.h:1072
GPU * gpu
Definition cuda_internal.h:1096
RemoteChannelInfo * construct_remote_info() const override
static const bool is_ordered
Definition cuda_internal.h:1077
XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total) override
long submit(Request **requests, long nr) override
bool supports_redop(ReductionOpID redop_id) const override
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
Definition cuda_internal.h:1099
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
bool serialize(S &serializer) const
virtual RemoteChannel * create_remote_channel()
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:1115
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:1118
Definition cuda_internal.h:1033
long get_requests(Request **requests, long nr)
const void * kernel_host_proxy_advanced
Definition cuda_internal.h:1065
std::vector< bool > src_is_ipc
Definition cuda_internal.h:1069
GPUStream * stream
Definition cuda_internal.h:1067
void record_redop_advanced_kernel(GPU *gpu)
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
const void * kernel_host_proxy
Definition cuda_internal.h:1064
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)
const void * kernel_host_proxy_transpose
Definition cuda_internal.h:1066
CUfunction kernel
Definition cuda_internal.h:1061
XferDesRedopInfo redop_info
Definition cuda_internal.h:1059
std::vector< GPU * > src_gpus
Definition cuda_internal.h:1068
bool fast_reduction_kernel_mode(GPUreduceChannel *channel, const size_t max_bytes, XferPort *in_port, XferPort *out_port, const size_t in_span_start, const size_t out_span_start)
const ReductionOpUntyped * redop
Definition cuda_internal.h:1060
KernelVariantDesc describe_kernel_variant(GPU *cpu, bool is_advanced)
CUfunction kernel_transpose
Definition cuda_internal.h:1063
void setup_redop_kernel(GPUreduceChannel *channel, void *params, const size_t in_span_start, const size_t out_span_start, const size_t in_elem_size, const size_t out_elem_size, const size_t elems, const bool has_transpose)
bool resolve_kernel_slot(GPU *gpu, void *host_proxy, CUfunction &kernel_out, CUfunction GPU::GPUReductionOpEntry::*cache_field)
CUfunction kernel_advanced
Definition cuda_internal.h:1062
Definition cuda_internal.h:773
CUarray array
Definition cuda_internal.h:778
Definition threads.h:428
Definition instance.h:405
Definition ib_memory.h:30
Definition indexspace.h:1115
Definition inst_layout.h:267
Definition tasks.h:181
Definition lists.h:66
Definition mem_impl.h:344
Definition proc_impl.h:141
Definition mem_impl.h:212
Definition mem_impl.h:50
MemoryKind
Definition mem_impl.h:53
size_t size
Definition mem_impl.h:195
AllocationResult
Definition mem_impl.h:89
Definition memory.h:33
Kind
Definition memory.h:59
Definition module.h:100
Definition network.h:262
Definition operation.h:75
Operation * op
Definition operation.h:87
Definition operation.h:32
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition inst_impl.h:54
Definition channel.h:896
Definition channel.h:939
Definition repl_heap.h:50
Definition channel.h:103
Definition runtime_impl.h:265
Definition channel.h:909
Definition channel.h:1019
Definition tasks.h:199
Definition tasks.h:41
Definition threads.h:89
Definition timers.h:129
Definition mutex.h:325
Definition mutex.h:223
Definition channel.h:286
Channel * channel
Definition channel.h:343
Definition atomics.h:31
Definition utils.h:84
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define CUDA_DRIVER_APIS(__op__)
Definition cuda_internal.h:1409
#define NVML_APIS(__op__)
Definition cuda_internal.h:1550
#define DECL_FNPTR_EXTERN(name, ver)
Definition cuda_internal.h:1511
#define CUPTI_APIS(__op__)
Definition cuda_internal.h:1568
#define cudaDeviceProp
Definition hip_cuda.h:24
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42
CudaModule * cuda_module_singleton
CUresult cuGetProcAddress(const char *, void **, int, int)
CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event)
GPUMemcpyKind
Definition cuda_internal.h:162
@ GPU_MEMCPY_PEER_TO_PEER
Definition cuda_internal.h:166
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition cuda_internal.h:163
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition cuda_internal.h:164
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition cuda_internal.h:165
nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType)
nvmlIntNvLinkDeviceType_enum
Definition cuda_internal.h:1538
@ NVML_NVLINK_DEVICE_TYPE_IBMNPU
Definition cuda_internal.h:1540
@ NVML_NVLINK_DEVICE_TYPE_SWITCH
Definition cuda_internal.h:1541
@ NVML_NVLINK_DEVICE_TYPE_UNKNOWN
Definition cuda_internal.h:1542
@ NVML_NVLINK_DEVICE_TYPE_GPU
Definition cuda_internal.h:1539
enum Realm::Cuda::nvmlIntNvLinkDeviceType_enum nvmlIntNvLinkDeviceType_t
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
XferDesKind
Definition channel.h:85
int CustomSerdezID
Definition custom_serdez.h:148
int OsHandle
Definition utils.h:399
unsigned long long XferDesID
Definition channel.h:57
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
#define REALM_MAX_DIM
Definition realm_config.h:34
Definition channel.h:684
Definition cuda_memcpy.h:102
Definition cuda_internal.h:1125
static void handle_message(NodeID sender, const CudaIpcImportRequest &args, const void *data, size_t datalen)
unsigned count
Definition cuda_internal.h:1126
long hostid
Definition cuda_internal.h:1128
Definition cudart_hijack.h:53
Definition cuda_internal.h:127
int pci_busid
Definition cuda_internal.h:141
CUdevice device
Definition cuda_internal.h:129
size_t pci_bandwidth
Definition cuda_internal.h:145
std::vector< size_t > logical_peer_bandwidth
Definition cuda_internal.h:148
int pci_domainid
Definition cuda_internal.h:142
CUuuid uuid
Definition cuda_internal.h:131
std::set< CUdevice > peers
Definition cuda_internal.h:140
bool has_numa_preference
Definition cuda_internal.h:138
bool pageable_access_supported
Definition cuda_internal.h:154
std::vector< size_t > logical_peer_latency
Definition cuda_internal.h:149
bool host_gpu_same_va
Definition cuda_internal.h:147
unsigned fabric_clique
Definition cuda_internal.h:152
bool fabric_supported
Definition cuda_internal.h:151
char name[MAX_NAME_LEN]
Definition cuda_internal.h:135
int major
Definition cuda_internal.h:132
size_t totalGlobalMem
Definition cuda_internal.h:136
int pci_deviceid
Definition cuda_internal.h:143
nvmlDevice_t nvml_dev
Definition cuda_internal.h:130
unsigned long numa_node_affinity[MAX_NUMA_NODE_LEN]
Definition cuda_internal.h:139
size_t c2c_bandwidth
Definition cuda_internal.h:144
int index
Definition cuda_internal.h:128
int minor
Definition cuda_internal.h:133
size_t nvswitch_bandwidth
Definition cuda_internal.h:146
CUuuid fabric_uuid
Definition cuda_internal.h:153
static const size_t MAX_NAME_LEN
Definition cuda_internal.h:134
static const size_t MAX_NUMA_NODE_LEN
Definition cuda_internal.h:137
Definition cuda_internal.h:576
Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition cuda_internal.h:578
Processor::TaskFuncPtr fnptr
Definition cuda_internal.h:577
ByteArray user_data
Definition cuda_internal.h:579
Definition cuda_internal.h:282
GPUWorkStart * start
Definition cuda_internal.h:285
CUevent event
Definition cuda_internal.h:283
GPUWorkFence * fence
Definition cuda_internal.h:284
GPUCompletionNotification * notification
Definition cuda_internal.h:286
Definition cuda_internal.h:512
uintptr_t address_offset
Definition cuda_internal.h:517
NodeID owner
Definition cuda_internal.h:513
GPU * src_gpu
Definition cuda_internal.h:514
Memory mem
Definition cuda_internal.h:515
uintptr_t local_base
Definition cuda_internal.h:516
Definition cuda_internal.h:457
CUfunction func
Definition cuda_internal.h:458
int occ_num_threads
Definition cuda_internal.h:459
int occ_num_blocks
Definition cuda_internal.h:460
Definition cuda_internal.h:524
CUfunction fold_excl
Definition cuda_internal.h:528
CUfunction fold_nonexcl_transpose
Definition cuda_internal.h:535
CUfunction fold_excl_transpose
Definition cuda_internal.h:536
CUfunction apply_excl_advanced
Definition cuda_internal.h:530
CUfunction apply_excl_transpose
Definition cuda_internal.h:534
CUfunction apply_nonexcl
Definition cuda_internal.h:525
CUfunction apply_nonexcl_advanced
Definition cuda_internal.h:529
CUfunction fold_nonexcl
Definition cuda_internal.h:527
CUfunction apply_excl
Definition cuda_internal.h:526
CUfunction fold_nonexcl_advanced
Definition cuda_internal.h:531
CUfunction fold_excl_advanced
Definition cuda_internal.h:532
CUfunction apply_nonexcl_transpose
Definition cuda_internal.h:533
Definition cuda_internal.h:1028
void * host_proxy
Definition cuda_internal.h:1029
CUfunction GPU::GPUReductionOpEntry::* cache_field
Definition cuda_internal.h:1030
Definition cuda_memcpy.h:114
Definition cudart_hijack.h:65
Definition cudart_hijack.h:76
Definition redop.h:56
Definition channel.h:210
Definition channel.h:300
NodeID src
Definition ucp_internal.h:1