doc/main/hip__internal_8h_source.html

/*

 * Copyright 2026 Stanford University, NVIDIA Corporation, Los Alamos National Laboratory

 * SPDX-License-Identifier: Apache-2.0

 *

 * Licensed under the Apache License, Version 2.0 (the "License");

 * you may not use this file except in compliance with the License.

 * You may obtain a copy of the License at

 *

 *     http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


#ifndef REALM_HIP_INTERNAL_H

#define REALM_HIP_INTERNAL_H


#include "realm/hip/hip_module.h"


#include <hip/hip_runtime.h>


#include "realm/operation.h"

#include "realm/threads.h"

#include "realm/circ_queue.h"

#include "realm/indexspace.h"

#include "realm/proc_impl.h"

#include "realm/mem_impl.h"

#include "realm/bgwork.h"

#include "realm/transfer/channel.h"

#include "realm/transfer/ib_memory.h"


// Evaluates the HIP call `cmd` exactly once.  If the result is anything other
//  than hipSuccess, prints the failing expression text, the numeric error code
//  and the hipGetErrorString() description to stderr and then terminates:
//  assert(0) fires when assertions are enabled, exit(1) runs otherwise.
// The expansion is a single do { ... } while(0) statement, so it is safe in
//  unbraced if/else bodies.
// NOTE: every line of the definition must end in a line continuation with no
//  blank line in between - a blank line ends the #define and leaves the rest
//  of the body outside the macro.
#define CHECK_CUDART(cmd)                                                                \
  do {                                                                                   \
    hipError_t ret = (cmd);                                                              \
    if(ret != hipSuccess) {                                                              \
      fprintf(stderr, "HIP: %s = %d (%s)\n", #cmd, ret, hipGetErrorString(ret));         \
      assert(0);                                                                         \
      exit(1);                                                                           \
    }                                                                                    \
  } while(0)


// Prints a diagnostic for a failed HIP call to stderr and aborts the process.
//   cmd - C string describing the call that failed (CHECK_HIP passes the
//         stringified expression)
//   ret - the hipError_t that the call returned
// `ret` is copied into a local first so that it is evaluated only once, and
//  the locals carry a realm_hip_ prefix so they cannot capture identifiers
//  that appear in the caller's argument expressions.
// NOTE: every line of the definition must end in a line continuation with no
//  blank line in between - a blank line ends the #define early.
#define REPORT_HIP_ERROR(cmd, ret)                                                       \
  do {                                                                                   \
    const hipError_t realm_hip_err_ = (ret);                                             \
    const char *realm_hip_name_ = hipGetErrorName(realm_hip_err_);                       \
    const char *realm_hip_str_ = hipGetErrorString(realm_hip_err_);                      \
    fprintf(stderr, "HIP: %s = %d (%s): %s\n", (cmd), static_cast<int>(realm_hip_err_),  \
            realm_hip_name_, realm_hip_str_);                                            \
    abort();                                                                             \
  } while(0)

// Evaluates the HIP call `cmd` exactly once and hands any result other than
//  hipSuccess to REPORT_HIP_ERROR (which does not return).  Expands to a
//  single do { ... } while(0) statement.
#define CHECK_HIP(cmd)                                                                   \
  do {                                                                                   \
    hipError_t ret = (cmd);                                                              \
    if(ret != hipSuccess)                                                                \
      REPORT_HIP_ERROR(#cmd, ret);                                                       \
  } while(0)


namespace Realm {


  namespace Hip {


    // Per-device description record.
    // When REALM_USE_HIP_HIJACK is defined the struct derives from
    //  hipDeviceProp_t; otherwise it declares its own name / major / minor /
    //  totalGlobalMem fields instead.
    struct GPUInfo

#ifdef REALM_USE_HIP_HIJACK

      : public hipDeviceProp_t

#endif

    {

      int index; // index used by HIP runtime

      hipDevice_t device;


      // capacity of the 'name' buffer declared in the non-hijack build
      static const size_t MAX_NAME_LEN = 64;

#ifndef REALM_USE_HIP_HIJACK

      char name[MAX_NAME_LEN];


      // NOTE(review): presumably these stand in for the hipDeviceProp_t fields
      //  of the same names when the base class is not present - confirm
      int major, minor;

      size_t totalGlobalMem;

#endif

      std::set<hipDevice_t> peers; // other GPUs we can do p2p copies with

    };


    // Direction of a GPU copy.  The enumerators are numbered consecutively
    //  from zero in the order listed.
    enum GPUMemcpyKind
    {
      GPU_MEMCPY_HOST_TO_DEVICE = 0,
      GPU_MEMCPY_DEVICE_TO_HOST = 1,
      GPU_MEMCPY_DEVICE_TO_DEVICE = 2,
      GPU_MEMCPY_PEER_TO_PEER = 3
    };


    // Forward declarations of classes that are referenced before they are
    //  defined in this header

    class GPUProcessor;

    class GPUWorker;

    class GPUStream;

    class GPUFBMemory;

    class GPUDynamicFBMemory;

    class GPUZCMemory;

    class GPUFBIBMemory;

    class GPU;

    class HipModule;


    // declared here only; the definition lives in another translation unit
    extern HipModule *hip_module_singleton;


    // Abstract callback interface through which completion of a GPU operation
    //  is reported (at present only copies use it).  Implementers override
    //  request_completed(); the virtual destructor allows deletion through a
    //  base-class pointer.
    class GPUCompletionNotification {
    public:
      virtual ~GPUCompletionNotification() = default;

      virtual void request_completed() = 0;
    };


    // A completion notification that remembers the GPU it was created for and
    //  owns a Realm Event.
    // NOTE(review): presumably preempt() waits on wait_event until
    //  request_completed() is invoked - confirm against the implementation.
    class GPUPreemptionWaiter : public GPUCompletionNotification {

    public:

      GPUPreemptionWaiter(GPU *gpu);

      virtual ~GPUPreemptionWaiter(void) {}


    public:

      // GPUCompletionNotification interface
      virtual void request_completed(void);


    public:

      void preempt(void);


    private:

      GPU *const gpu; // pointer itself is immutable after construction

      Event wait_event;

    };


    // An AsyncWorkItem of a Realm::Operation that can be enqueued on a
    //  GPUStream.  Instances can also be chained into a FenceList through the
    //  embedded fence_list_link.
    class GPUWorkFence : public Realm::Operation::AsyncWorkItem {

    public:

      GPUWorkFence(Realm::Operation *op);


      virtual void mark_finished(bool successful);


      virtual void request_cancellation(void);


      void enqueue_on_stream(GPUStream *stream);


      virtual void print(std::ostream &os) const;


      // intrusive-list hook plus the list type built on it (uses DummyLock as
      //  its lock type)
      IntrusiveListLink<GPUWorkFence> fence_list_link;

      REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink<GPUWorkFence>, fence_list_link);

      typedef IntrusiveList<GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link),

                            DummyLock>

          FenceList;


    protected:

      // NOTE(review): signature matches a HIP stream callback; the "cuda" in the
      //  name appears to be inherited from the CUDA module - confirm usage
      static void cuda_callback(hipStream_t stream, hipError_t res, void *data);

    };


    // An AsyncWorkItem of a Realm::Operation that can be enqueued on a
    //  GPUStream.  Cancellation requests are accepted and ignored.
    // NOTE(review): judging by the names this marks the moment GPU work
    //  begins on the stream - confirm against the implementation.
    class GPUWorkStart : public Realm::Operation::AsyncWorkItem {
    public:
      GPUWorkStart(Realm::Operation *op);

      // nothing to cancel for this work item
      virtual void request_cancellation() {}

      void enqueue_on_stream(GPUStream *stream);

      virtual void print(std::ostream &os) const;

      void mark_gpu_work_start();

    protected:
      static void cuda_start_callback(hipStream_t stream, hipError_t res, void *data);
    };


    // a class that represents a HIP stream and work associated with

    //  it (e.g. queued copies, events in flight)

    // a stream is also associated with a GPUWorker that it will register

    //  with when async work needs doing


    class GPUStream {

    public:

      // rel_priority defaults to 0
      // NOTE(review): presumably relative to the GPU's stream priority range -
      //  confirm against the constructor
      GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0);

      ~GPUStream(void);


      GPU *get_gpu(void) const;

      REALM_INTERNAL_API_EXTERNAL_LINKAGE hipStream_t

      get_stream(void) const; // needed by librealm_kokkos.so


      // may be called by anybody to enqueue a copy or an event

      void add_fence(GPUWorkFence *fence);

      void add_start_event(GPUWorkStart *start);

      void add_notification(GPUCompletionNotification *notification);

      void wait_on_streams(const std::set<GPUStream *> &other_streams);


      // atomically checks rate limit counters and returns true if 'bytes'

      //  worth of copies can be submitted or false if not (in which case

      //  the progress counter on the xd will be updated when it should try

      //  again)

      bool ok_to_submit_copy(size_t bytes, XferDes *xd);


      // to be called by a worker (that should already have the GPU context

      //   current) - returns true if any work remains

      bool reap_events(TimeLimit work_until);


    protected:

      // may only be tested with lock held

      bool has_work(void) const;


      // records an event together with whichever of fence / notification /
      //  start it belongs to (notification and start default to NULL)
      void add_event(hipEvent_t event, GPUWorkFence *fence,

                     GPUCompletionNotification *notification = NULL,

                     GPUWorkStart *start = NULL);


      GPU *gpu;

      GPUWorker *worker;


      hipStream_t stream;


      // NOTE(review): presumably the lock referred to by has_work() and the
      //  guard for pending_events - confirm
      Mutex mutex;


      // one recorded event plus the objects associated with it
      struct PendingEvent {

        hipEvent_t event;

        GPUWorkFence *fence;

        GPUWorkStart *start;

        GPUCompletionNotification *notification;

      };


      // container type is selected at compile time by USE_CQ
#ifdef USE_CQ

      Realm::CircularQueue<PendingEvent> pending_events;

#else

      std::deque<PendingEvent> pending_events;

#endif

    };


    // a GPUWorker is responsible for making progress on one or more GPUStreams -

    //  this may be done directly by a GPUProcessor or in a background thread

    //  spawned for the purpose


    class GPUWorker : public BackgroundWorkItem {

    public:

      GPUWorker(void);

      virtual ~GPUWorker(void);


      // adds a stream that has work to be done

      void add_stream(GPUStream *s);


      // used to start a dedicated thread (mutually exclusive with being

      //  registered with a background work manager)

      void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size);

      void shutdown_background_thread(void);


      // NOTE(review): presumably the BackgroundWorkItem entry point, bounded by
      //  'work_until' - confirm against the base class
      bool do_work(TimeLimit work_until);


    public:

      void thread_main(void);


    protected:

      // used by the background thread

      // processes work on streams, optionally sleeping for work to show up

      // returns true if work remains to be done

      bool process_streams(bool sleep_on_empty);


      Mutex lock;

      Mutex::CondVar condvar;


      // queue of streams with outstanding work; 16 is the second template
      //  argument given to CircularQueue
      typedef CircularQueue<GPUStream *, 16> ActiveStreamQueue;

      ActiveStreamQueue active_streams;


      // used by the background thread (if any)

      Realm::CoreReservation *core_rsrv;

      Realm::Thread *worker_thread;

      bool thread_sleeping;

      atomic<bool> worker_shutdown_requested;

    };


    // a little helper class to manage a pool of HIP events (hipEvent_t) that can

    //  be reused to reduce alloc/destroy overheads


    class GPUEventPool {

    public:

      // batch size defaults to 256
      GPUEventPool(int _batch_size = 256);


      // allocating the initial batch of events and cleaning up are done with

      //  these methods instead of constructor/destructor because we don't

      //  manage the GPU context in this helper class

      void init_pool(int init_size = 0 /* default == batch size */);

      void empty_pool(void);


      // hand out / take back one event; 'external' defaults to false
      // NOTE(review): presumably 'external' events are the ones tallied in
      //  external_count - confirm against the implementation
      hipEvent_t get_event(bool external = false);

      void return_event(hipEvent_t e, bool external = false);


    protected:

      Mutex mutex;

      int batch_size, current_size, total_size, external_count;

      std::vector<hipEvent_t> available_events;

    };


    // when the runtime hijack is not enabled/active, a cuCtxSynchronize

    //  is required to ensure a task's completion event covers all of its

    //  actions - rather than blocking an important thread, we create a

    //  small thread pool to handle these

    // NOTE(review): "cuCtxSynchronize" is the CUDA driver name; the HIP call
    //  actually used is not visible in this header - confirm


    class ContextSynchronizer {

    public:

      ContextSynchronizer(GPU *_gpu, int _device_id, CoreReservationSet &crs,

                          int _max_threads);

      ~ContextSynchronizer();


      // hands a fence to the synchronizer
      void add_fence(GPUWorkFence *fence);


      void shutdown_threads();


      // body run by each pool thread
      void thread_main();


    protected:

      GPU *gpu;

      // hipCtx_t context;

      int device_id;

      int max_threads;

      Mutex mutex;

      Mutex::CondVar condvar;

      bool shutdown_flag;

      // fences handed over via add_fence
      // NOTE(review): presumably still awaiting synchronization - confirm
      GPUWorkFence::FenceList fences;

      // thread-pool bookkeeping counters
      int total_threads, sleeping_threads, syncing_threads;

      std::vector<Thread *> worker_threads;

      CoreReservation *core_rsrv;

    };


    // forward declarations of the registration records taken by the
    //  REALM_USE_HIP_HIJACK-only GPU::register_* methods declared in this header
    struct FatBin;

    struct RegisteredVariable;

    struct RegisteredFunction;


    // a GPU object represents our use of a given HIP-capable GPU - this will

    //  have an associated HIP context, a (possibly shared) worker thread, a

    //  processor, and an FB memory (the ZC memory is shared across all GPUs)


    class GPU {

    public:

      GPU(HipModule *_module, GPUInfo *_info, GPUWorker *worker, int _device_id);

      ~GPU(void);


      // NOTE(review): presumably make this GPU's device/context current and
      //  restore the previous one (see AutoGPUContext) - confirm
      void push_context(void);

      void pop_context(void);


      // registration and lookup of device code, only in hijack builds
#ifdef REALM_USE_HIP_HIJACK

      void register_fat_binary(const FatBin *data);

      void register_variable(const RegisteredVariable *var);

      void register_function(const RegisteredFunction *func);


      hipFunction_t lookup_function(const void *func);

      char *lookup_variable(const void *var);

#endif


      // creation of the Realm-visible objects that belong to this GPU
      void create_processor(RuntimeImpl *runtime, size_t stack_size);

      void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size);

      void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size);


      void create_dma_channels(Realm::RuntimeImpl *r);


      bool can_access_peer(GPU *peer);


      GPUStream *find_stream(hipStream_t stream) const;

      REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream *

      get_null_task_stream(void) const; // needed by librealm_kokkos.so

      // NOTE(review): presumably rotate through task_streams /
      //  device_to_device_streams using the next_* counters below - confirm
      GPUStream *get_next_task_stream(bool create = false);

      GPUStream *get_next_d2d_stream();


    protected:

      hipModule_t load_hip_module(const void *data);


    public:

      // all pointers below default to nullptr
      HipModule *module = nullptr;

      GPUInfo *info = nullptr;

      GPUWorker *worker = nullptr;

      GPUProcessor *proc = nullptr;

      GPUFBMemory *fbmem = nullptr;

      GPUDynamicFBMemory *fb_dmem = nullptr;

      GPUFBIBMemory *fb_ibmem = nullptr;


      // hipCtx_t context;

      int device_id = -1; // -1 until set


      char *fbmem_base = nullptr;


      char *fb_ibmem_base = nullptr;


      // which system memories have been registered and can be used for cuMemcpyAsync

      std::set<Memory> pinned_sysmems;


      // managed memories we can concurrently access

      std::set<Memory> managed_mems;


      // which other FBs we have peer access to

      std::set<Memory> peer_fbs;


      // streams for different copy types and a pile for actual tasks

      GPUStream *host_to_device_stream = nullptr;

      GPUStream *device_to_host_stream = nullptr;

      GPUStream *device_to_device_stream = nullptr;

      std::vector<GPUStream *> device_to_device_streams;

      std::vector<GPUStream *> peer_to_peer_streams; // indexed by target

      std::vector<GPUStream *> task_streams;

      // atomic counters, both starting at 0
      atomic<unsigned> next_task_stream = atomic<unsigned>(0);

      atomic<unsigned> next_d2d_stream = atomic<unsigned>(0);


      GPUEventPool event_pool;


      // this can technically be different in each context (but probably isn't

      //  in practice)

      int least_stream_priority, greatest_stream_priority;


      // one IPC-mapped memory: who owns it, which Memory it is, and where it
      //  is mapped locally
      struct HipIpcMapping {

        NodeID owner;

        Memory mem;

        uintptr_t local_base;

        uintptr_t address_offset; // add to convert from original to local base

      };


      std::vector<HipIpcMapping> hipipc_mappings;

      std::map<NodeID, GPUStream *> hipipc_streams; // keyed by node


      // NOTE(review): presumably returns nullptr when 'mem' has no entry in
      //  hipipc_mappings - confirm
      const HipIpcMapping *find_ipc_mapping(Memory mem) const;


      // lookup tables for registered device code, only in hijack builds
#ifdef REALM_USE_HIP_HIJACK

      std::map<const FatBin *, hipModule_t> device_modules;

      std::map<const void *, hipFunction_t> device_functions;

      std::map<const void *, char *> device_variables;

#endif

    };


    // helper to push/pop a GPU's context by scope
    //  (constructible from either a GPU reference or a GPU pointer)


    class AutoGPUContext {

    public:

      AutoGPUContext(GPU &_gpu);

      AutoGPUContext(GPU *_gpu);

      ~AutoGPUContext(void);


    protected:

      // the GPU this guard was constructed for
      GPU *gpu;

    };


    // A LocalTaskProcessor bound to one GPU.  It keeps its own GPU task table
    //  and exposes wrappers for stream/event/memcpy/memset operations; the
    //  kernel-launch and symbol-copy entry points exist only when
    //  REALM_USE_HIP_HIJACK is defined.
    class REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUProcessor // needed by librealm_kokkos.so

      : public Realm::LocalTaskProcessor {

    public:

      GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me,

                   Realm::CoreReservationSet &crs, size_t _stack_size);

      virtual ~GPUProcessor(void);


    public:

      virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc,

                                 const ByteArrayRef &user_data);


      virtual void shutdown(void);


    protected:

      virtual void execute_task(Processor::TaskFuncID func_id,

                                const ByteArrayRef &task_args);


    public:

      // NOTE(review): presumably returns the processor running the calling
      //  thread's task - confirm against the implementation
      static GPUProcessor *get_current_gpu_proc(void);


#ifdef REALM_USE_HIP_HIJACK

      // calls that come from the HIP runtime API

      void push_call_configuration(dim3 grid_dim, dim3 block_dim, size_t shared_size,

                                   void *stream);

      void pop_call_configuration(dim3 *grid_dim, dim3 *block_dim, size_t *shared_size,

                                  void *stream);

#endif


      void stream_wait_on_event(hipStream_t stream, hipEvent_t event);

      void stream_synchronize(hipStream_t stream);

      void device_synchronize(void);


#ifdef REALM_USE_HIP_HIJACK

      void event_create(hipEvent_t *event, int flags);

      void event_destroy(hipEvent_t event);

      void event_record(hipEvent_t event, hipStream_t stream);

      void event_synchronize(hipEvent_t event);

      void event_elapsed_time(float *ms, hipEvent_t start, hipEvent_t end);


      void configure_call(dim3 grid_dim, dim3 block_dim, size_t shared_memory,

                          hipStream_t stream);

      void setup_argument(const void *arg, size_t size, size_t offset);

      void launch(const void *func);

      void launch_kernel(const void *func, dim3 grid_dim, dim3 block_dim, void **args,

                         size_t shared_memory, hipStream_t stream);

#endif


      void gpu_memcpy(void *dst, const void *src, size_t size, hipMemcpyKind kind);

      void gpu_memcpy_async(void *dst, const void *src, size_t size, hipMemcpyKind kind,

                            hipStream_t stream);

#ifdef REALM_USE_HIP_HIJACK

      void gpu_memcpy_to_symbol(const void *dst, const void *src, size_t size,

                                size_t offset, hipMemcpyKind kind);

      void gpu_memcpy_to_symbol_async(const void *dst, const void *src, size_t size,

                                      size_t offset, hipMemcpyKind kind,

                                      hipStream_t stream);

      void gpu_memcpy_from_symbol(void *dst, const void *src, size_t size, size_t offset,

                                  hipMemcpyKind kind);

      void gpu_memcpy_from_symbol_async(void *dst, const void *src, size_t size,

                                        size_t offset, hipMemcpyKind kind,

                                        hipStream_t stream);

#endif


      void gpu_memset(void *dst, int value, size_t count);

      void gpu_memset_async(void *dst, int value, size_t count, hipStream_t stream);


    public:

      GPU *gpu;


      // data needed for kernel launches


      // grid/block dimensions plus a shared-memory size
      struct LaunchConfig {

        dim3 grid;

        dim3 block;

        size_t shared;

        LaunchConfig(dim3 _grid, dim3 _block, size_t _shared);

      };


      // a LaunchConfig extended with the stream to use
      struct CallConfig : public LaunchConfig {

        hipStream_t stream;

        CallConfig(dim3 _grid, dim3 _block, size_t _shared, hipStream_t _stream);

      };


      std::vector<CallConfig> launch_configs;

      std::vector<char> kernel_args;

      std::vector<CallConfig> call_configs;

      bool block_on_synchronize;

      ContextSynchronizer ctxsync;


    protected:

      Realm::CoreReservation *core_rsrv;


      // one registered task: a plain entry point, a stream-aware entry point
      //  and the user data supplied at registration
      // NOTE(review): presumably only one of the two function pointers is set
      //  per entry - confirm against register_task
      struct GPUTaskTableEntry {

        Processor::TaskFuncPtr fnptr;

        Hip::StreamAwareTaskFuncPtr stream_aware_fnptr;

        ByteArray user_data;

      };


      // we're not using the parent's task table, but we can use the mutex

      // RWLock task_table_mutex;

      std::map<Processor::TaskFuncID, GPUTaskTableEntry> gpu_task_table;

    };


    // this can be attached to any MemoryImpl if the underlying memory is

    //  guaranteed to belong to a given device - this will allow that

    //  context's processor and dma channels to work with it

    // the creator is expected to know what device they want but need

    //  not know which GPU object that corresponds to


    class HipDeviceMemoryInfo : public ModuleSpecificInfo {

    public:

      HipDeviceMemoryInfo(int _device_id);


      int device_id;

      // NOTE(review): presumably resolved from device_id by the constructor -
      //  confirm against the implementation
      GPU *gpu;

    };


    // A LocalManagedMemory associated with one GPU; it records the owning GPU,
    //  a base pointer and a NetworkSegment.
    // NOTE(review): "FB" presumably stands for framebuffer (device memory) -
    //  confirm.
    class GPUFBMemory : public LocalManagedMemory {

    public:

      GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base,

                  size_t _size);


      virtual ~GPUFBMemory(void);


      // these work, but they are SLOW

      virtual void get_bytes(off_t offset, void *dst, size_t size);

      virtual void put_bytes(off_t offset, const void *src, size_t size);


      virtual void *get_direct_ptr(off_t offset, size_t size);


      // GPUFBMemory supports ExternalHipMemoryResource and

      //  ExternalHipArrayResource (not implemented)

      virtual bool attempt_register_external_resource(RegionInstanceImpl *inst,

                                                      size_t &inst_offset);

      virtual void unregister_external_resource(RegionInstanceImpl *inst);


      // for re-registration purposes, generate an ExternalInstanceResource *

      //  (if possible) for a given instance, or a subset of one

      virtual ExternalInstanceResource *

      generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace,

                             span<const FieldID> fields, bool read_only);


    public:

      GPU *gpu;

      char *base;

      NetworkSegment local_segment;

    };


    // A MemoryImpl associated with one GPU that supports only immediate
    //  (non-deferred) allocation and keeps a per-instance record of
    //  (pointer, size) pairs.
    class GPUDynamicFBMemory : public MemoryImpl {

    public:

      GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu,

                         size_t _max_size);


      virtual ~GPUDynamicFBMemory(void);

      void cleanup(void);


      // deferred allocation not supported

      virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst,

                                                          bool need_alloc_result,

                                                          bool poisoned,

                                                          TimeLimit work_until);


      virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned,

                                             TimeLimit work_until);


      // these work, but they are SLOW

      virtual void get_bytes(off_t offset, void *dst, size_t size);

      virtual void put_bytes(off_t offset, const void *src, size_t size);


      virtual void *get_direct_ptr(off_t offset, size_t size);


      // GPUDynamicFBMemory supports ExternalHipMemoryResource and

      //  ExternalHipArrayResource (not implemented)

      virtual bool attempt_register_external_resource(RegionInstanceImpl *inst,

                                                      size_t &inst_offset);

      virtual void unregister_external_resource(RegionInstanceImpl *inst);


      // for re-registration purposes, generate an ExternalInstanceResource *

      //  (if possible) for a given instance, or a subset of one

      virtual ExternalInstanceResource *

      generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace,

                             span<const FieldID> fields, bool read_only);


    public:

      GPU *gpu;

      // NOTE(review): presumably guards cur_size and alloc_bases - confirm
      Mutex mutex;

      size_t cur_size;

      // per-instance (pointer, size) record
      std::map<RegionInstance, std::pair<void *, size_t>> alloc_bases;

    };


    // A LocalManagedMemory described by two base pointers for the same region:
    //  one as seen from the GPU and one as seen from the CPU.
    // NOTE(review): "ZC" presumably stands for zero-copy - confirm.
    class GPUZCMemory : public LocalManagedMemory {

    public:

      GPUZCMemory(RuntimeImpl *_runtime_impl, Memory _me, char *_gpu_base,

                  void *_cpu_base, size_t _size, MemoryKind _kind,

                  Memory::Kind _lowlevel_kind);


      virtual ~GPUZCMemory(void);


      virtual void get_bytes(off_t offset, void *dst, size_t size);


      virtual void put_bytes(off_t offset, const void *src, size_t size);


      virtual void *get_direct_ptr(off_t offset, size_t size);


      // GPUZCMemory supports ExternalHipPinnedHostResource

      virtual bool attempt_register_external_resource(RegionInstanceImpl *inst,

                                                      size_t &inst_offset);

      virtual void unregister_external_resource(RegionInstanceImpl *inst);


      // for re-registration purposes, generate an ExternalInstanceResource *

      //  (if possible) for a given instance, or a subset of one

      virtual ExternalInstanceResource *

      generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace,

                             span<const FieldID> fields, bool read_only);


    public:

      char *gpu_base;

      char *cpu_base;

      NetworkSegment local_segment;

    };


    // An IBMemory associated with one GPU; it records the owning GPU, a base
    //  pointer and a NetworkSegment.
    // NOTE(review): "IB" presumably stands for intermediate buffer - confirm.
    class GPUFBIBMemory : public IBMemory {

    public:

      GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base,

                    size_t _size);


    public:

      GPU *gpu;

      char *base;

      NetworkSegment local_segment;

    };


    // forward declaration: GPUCompletionEvent refers to GPURequest before it
    //  is defined
    class GPURequest;


    // A completion notification that carries a pointer to a GPURequest.
    class GPUCompletionEvent : public GPUCompletionNotification {

    public:

      void request_completed(void);


      // NOTE(review): presumably the request this notification reports on -
      //  confirm
      GPURequest *req;

    };


    // A Request extended with source/destination base pointers, a destination
    //  GPU and an embedded GPUCompletionEvent.
    class GPURequest : public Request {

    public:

      const void *src_base;

      void *dst_base;

      // off_t src_gpu_off, dst_gpu_off;

      GPU *dst_gpu;

      GPUCompletionEvent event;

    };


    // A completion notification that stores an XferDes together with a read
    //  (port index, offset, size) triple and a write (port index, offset, size)
    //  triple.
    // NOTE(review): presumably request_completed() reports these ranges back
    //  to the XferDes - confirm against the implementation.
    class GPUTransferCompletion : public GPUCompletionNotification {

    public:

      GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset,

                            size_t _read_size, int _write_port_idx, size_t _write_offset,

                            size_t _write_size);


      virtual void request_completed(void);


    protected:

      XferDes *xd;

      int read_port_idx;

      size_t read_offset, read_size;

      int write_port_idx;

      size_t write_offset, write_size;

    };


    // forward declaration: GPUXferDes::progress_xd takes a GPUChannel *
    class GPUChannel;


    // The XferDes type used by GPUChannel (see its SingleXDQChannel base).
    class GPUXferDes : public XferDes {

    public:

      GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,

                 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,

                 const std::vector<XferDesPortInfo> &outputs_info, int _priority);


      long get_requests(Request **requests, long nr);


      bool progress_xd(GPUChannel *channel, TimeLimit work_until);


    private:

      // NOTE(review): presumably one entry per input / output port - confirm
      std::vector<GPU *> src_gpus, dst_gpus;

      std::vector<bool> dst_is_ipc;

    };


    // A SingleXDQChannel of GPUXferDes, constructed for one source GPU and one
    //  XferDesKind.
    class GPUChannel : public SingleXDQChannel<GPUChannel, GPUXferDes> {

    public:

      GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork);

      ~GPUChannel();


      // multi-threading of cuda copies for a given device is disabled by

      //  default (can be re-enabled with -cuda:mtdma 1)

      // NOTE(review): the flag above carries the CUDA module's prefix; the
      //  HIP module's spelling is not visible here - confirm

      static const bool is_ordered = true;


      virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,

                                       XferDesID guid,

                                       const std::vector<XferDesPortInfo> &inputs_info,

                                       const std::vector<XferDesPortInfo> &outputs_info,

                                       int priority, XferDesRedopInfo redop_info,

                                       const void *fill_data, size_t fill_size,

                                       size_t fill_total);


      long submit(Request **requests, long nr);


    private:

      GPU *src_gpu;

      // std::deque<Request*> pending_copies;

    };


    // forward declaration: GPUfillXferDes::progress_xd takes a GPUfillChannel *
    class GPUfillChannel;


    // The XferDes type used by GPUfillChannel; the constructor additionally
    //  takes the fill pattern (_fill_data / _fill_size) and _fill_total.
    class GPUfillXferDes : public XferDes {

    public:

      GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,

                     XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,

                     const std::vector<XferDesPortInfo> &outputs_info, int _priority,

                     const void *_fill_data, size_t _fill_size, size_t _fill_total);


      long get_requests(Request **requests, long nr);


      bool progress_xd(GPUfillChannel *channel, TimeLimit work_until);


    protected:

      // NOTE(review): presumably the fill pattern size after collapsing a
      //  repeating pattern - confirm against the constructor
      size_t reduced_fill_size;

    };


    // A SingleXDQChannel of GPUfillXferDes bound to one GPU.  GPUfillXferDes is
    //  a friend so that it can reach the protected gpu pointer.
    class GPUfillChannel : public SingleXDQChannel<GPUfillChannel, GPUfillXferDes> {

    public:

      GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork);


      // multiple concurrent cuda fills ok

      static const bool is_ordered = false;


      virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,

                                       XferDesID guid,

                                       const std::vector<XferDesPortInfo> &inputs_info,

                                       const std::vector<XferDesPortInfo> &outputs_info,

                                       int priority, XferDesRedopInfo redop_info,

                                       const void *fill_data, size_t fill_size,

                                       size_t fill_total);


      long submit(Request **requests, long nr);


    protected:

      friend class GPUfillXferDes;


      GPU *gpu;

    };


    // forward declaration: GPUreduceXferDes::progress_xd takes a
    //  GPUreduceChannel *
    class GPUreduceChannel;


    // The XferDes type used by GPUreduceChannel; it stores the reduction
    //  description, a pointer to the reduction operator, a kernel handle and
    //  the stream to use.
    class GPUreduceXferDes : public XferDes {

    public:

      GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,

                       XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,

                       const std::vector<XferDesPortInfo> &outputs_info, int _priority,

                       XferDesRedopInfo _redop_info);


      long get_requests(Request **requests, long nr);


      bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until);


    protected:

      XferDesRedopInfo redop_info;

      const ReductionOpUntyped *redop;

      // the kernel handle has a different type and name depending on whether
      //  the hijack is compiled in
#if defined(REALM_USE_HIP_HIJACK)

      void *kernel;

#else

      const void *kernel_host_proxy;

#endif

      GPUStream *stream;

    };


    // A SingleXDQChannel of GPUreduceXferDes bound to one GPU.
    //  GPUreduceXferDes is a friend so that it can reach the protected gpu
    //  pointer.
    class GPUreduceChannel : public SingleXDQChannel<GPUreduceChannel, GPUreduceXferDes> {

    public:

      GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork);


      // multiple concurrent cuda reduces ok

      static const bool is_ordered = false;


      virtual bool supports_redop(ReductionOpID redop_id) const;


      // NOTE(review): presumably yields a GPUreduceRemoteChannelInfo -
      //  confirm against the implementation
      virtual RemoteChannelInfo *construct_remote_info() const;


      virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,

                                       XferDesID guid,

                                       const std::vector<XferDesPortInfo> &inputs_info,

                                       const std::vector<XferDesPortInfo> &outputs_info,

                                       int priority, XferDesRedopInfo redop_info,

                                       const void *fill_data, size_t fill_size,

                                       size_t fill_total);


      long submit(Request **requests, long nr);


    protected:

      friend class GPUreduceXferDes;


      GPU *gpu;

    };


    // A SimpleRemoteChannelInfo subclass that is serializable and registers
    //  itself as a polymorphic subclass of RemoteChannelInfo through
    //  serdez_subclass.
    class GPUreduceRemoteChannelInfo : public SimpleRemoteChannelInfo {

    public:

      GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,

                                 const std::vector<Channel::SupportedPath> &_paths);


      // NOTE(review): presumably returns a GPUreduceRemoteChannel (which
      //  befriends this class) - confirm against the implementation
      virtual RemoteChannel *create_remote_channel();


      // serialization hooks, templated on the (de)serializer type
      template <typename S>

      bool serialize(S &serializer) const;


      template <typename S>

      static RemoteChannelInfo *deserialize_new(S &deserializer);


    protected:

      static Serialization::PolymorphicSerdezSubclass<RemoteChannelInfo,

                                                      GPUreduceRemoteChannelInfo>

          serdez_subclass;

    };


    // A RemoteChannel whose constructor is private (default class access), so
    //  only the befriended GPUreduceRemoteChannelInfo can create instances.
    class GPUreduceRemoteChannel : public RemoteChannel {

      friend class GPUreduceRemoteChannelInfo;


      GPUreduceRemoteChannel(uintptr_t _remote_ptr);

    };


    // active messages for establishing HIP IPC mappings (see
    //  GPU::HipIpcMapping)


    // request message; carries the sender's POSIX hostid on Linux builds only
    struct HipIpcRequest {

#ifdef REALM_ON_LINUX

      long hostid; // POSIX hostid

#endif


      // handler for a received message: 'sender' is the originating node,
      //  'args' the message header, 'data'/'datalen' the payload
      static void handle_message(NodeID sender, const HipIpcRequest &args,

                                 const void *data, size_t datalen);

    };


    // response message for the HIP IPC exchange
    struct HipIpcResponse {

      // NOTE(review): presumably the number of entries carried in the
      //  payload - confirm against the handler
      unsigned count;


      // handler for a received message: 'sender' is the originating node,
      //  'args' the message header, 'data'/'datalen' the payload
      static void handle_message(NodeID sender, const HipIpcResponse &args,

                                 const void *data, size_t datalen);

    };


    // release message for the HIP IPC exchange; it has no header fields
    struct HipIpcRelease {


      // handler for a received message: 'sender' is the originating node,
      //  'args' the message header, 'data'/'datalen' the payload
      static void handle_message(NodeID sender, const HipIpcRelease &args,

                                 const void *data, size_t datalen);

    };


    // A ReplicatedHeap::Listener that holds the HipModule it was created for
    //  and receives callbacks when heap chunks are created or destroyed.
    // NOTE(review): presumably used to register/unregister each chunk with
    //  the HIP runtime - confirm against the implementation.
    class GPUReplHeapListener : public ReplicatedHeap::Listener {

    public:

      GPUReplHeapListener(HipModule *_module);


      // 'base' and 'bytes' give the start address and size of the chunk
      virtual void chunk_created(void *base, size_t bytes);

      virtual void chunk_destroyed(void *base, size_t bytes);


    protected:

      HipModule *module;

    };


  }; // namespace Hip


}; // namespace Realm


#endif

bgwork.h

channel.h

circ_queue.h

Realm::BackgroundWorkItem
Definition bgwork.h:129

Realm::BackgroundWorkManager
Definition bgwork.h:36

Realm::ByteArrayRef
Definition bytearray.h:30

Realm::ByteArray
Definition bytearray.h:53

Realm::Channel
Definition channel.h:712

Realm::CircularQueue
Definition circ_queue.h:35

Realm::CodeDescriptor
Definition codedesc.h:249

Realm::CoreReservationSet
Definition threads.h:382

Realm::CoreReservation
Definition threads.h:342

Realm::DummyLock
Definition threads.h:428

Realm::Event
Definition event.h:50

Realm::ExternalInstanceResource
Definition instance.h:405

Realm::Hip::AutoGPUContext
Definition hip_internal.h:416

Realm::Hip::AutoGPUContext::AutoGPUContext
AutoGPUContext(GPU *_gpu)

Realm::Hip::AutoGPUContext::gpu
GPU * gpu
Definition hip_internal.h:423

Realm::Hip::AutoGPUContext::AutoGPUContext
AutoGPUContext(GPU &_gpu)

Realm::Hip::AutoGPUContext::~AutoGPUContext
~AutoGPUContext(void)

Realm::Hip::ContextSynchronizer
Definition hip_internal.h:289

Realm::Hip::ContextSynchronizer::condvar
Mutex::CondVar condvar
Definition hip_internal.h:307

Realm::Hip::ContextSynchronizer::thread_main
void thread_main()

Realm::Hip::ContextSynchronizer::~ContextSynchronizer
~ContextSynchronizer()

Realm::Hip::ContextSynchronizer::shutdown_flag
bool shutdown_flag
Definition hip_internal.h:308

Realm::Hip::ContextSynchronizer::max_threads
int max_threads
Definition hip_internal.h:305

Realm::Hip::ContextSynchronizer::gpu
GPU * gpu
Definition hip_internal.h:302

Realm::Hip::ContextSynchronizer::total_threads
int total_threads
Definition hip_internal.h:310

Realm::Hip::ContextSynchronizer::ContextSynchronizer
ContextSynchronizer(GPU *_gpu, int _device_id, CoreReservationSet &crs, int _max_threads)

Realm::Hip::ContextSynchronizer::fences
GPUWorkFence::FenceList fences
Definition hip_internal.h:309

Realm::Hip::ContextSynchronizer::device_id
int device_id
Definition hip_internal.h:304

Realm::Hip::ContextSynchronizer::core_rsrv
CoreReservation * core_rsrv
Definition hip_internal.h:312

Realm::Hip::ContextSynchronizer::worker_threads
std::vector< Thread * > worker_threads
Definition hip_internal.h:311

Realm::Hip::ContextSynchronizer::sleeping_threads
int sleeping_threads
Definition hip_internal.h:310

Realm::Hip::ContextSynchronizer::syncing_threads
int syncing_threads
Definition hip_internal.h:310

Realm::Hip::ContextSynchronizer::add_fence
void add_fence(GPUWorkFence *fence)

Realm::Hip::ContextSynchronizer::shutdown_threads
void shutdown_threads()

Realm::Hip::ContextSynchronizer::mutex
Mutex mutex
Definition hip_internal.h:306

Realm::Hip::GPUChannel
Definition hip_internal.h:705

Realm::Hip::GPUChannel::submit
long submit(Request **requests, long nr)

Realm::Hip::GPUChannel::GPUChannel
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)

Realm::Hip::GPUChannel::is_ordered
static const bool is_ordered
Definition hip_internal.h:712

Realm::Hip::GPUChannel::create_xfer_des
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)

Realm::Hip::GPUChannel::~GPUChannel
~GPUChannel()

Realm::Hip::GPUCompletionEvent
Definition hip_internal.h:656

Realm::Hip::GPUCompletionEvent::request_completed
void request_completed(void)

Realm::Hip::GPUCompletionEvent::req
GPURequest * req
Definition hip_internal.h:660

Realm::Hip::GPUCompletionNotification
Definition hip_internal.h:106

Realm::Hip::GPUCompletionNotification::~GPUCompletionNotification
virtual ~GPUCompletionNotification(void)
Definition hip_internal.h:108

Realm::Hip::GPUCompletionNotification::request_completed
virtual void request_completed(void)=0

Realm::Hip::GPUDynamicFBMemory
Definition hip_internal.h:570

Realm::Hip::GPUDynamicFBMemory::gpu
GPU * gpu
Definition hip_internal.h:606

Realm::Hip::GPUDynamicFBMemory::unregister_external_resource
virtual void unregister_external_resource(RegionInstanceImpl *inst)

Realm::Hip::GPUDynamicFBMemory::alloc_bases
std::map< RegionInstance, std::pair< void *, size_t > > alloc_bases
Definition hip_internal.h:609

Realm::Hip::GPUDynamicFBMemory::cur_size
size_t cur_size
Definition hip_internal.h:608

Realm::Hip::GPUDynamicFBMemory::mutex
Mutex mutex
Definition hip_internal.h:607

Realm::Hip::GPUDynamicFBMemory::allocate_storage_immediate
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)

Realm::Hip::GPUDynamicFBMemory::get_bytes
virtual void get_bytes(off_t offset, void *dst, size_t size)

Realm::Hip::GPUDynamicFBMemory::cleanup
void cleanup(void)

Realm::Hip::GPUDynamicFBMemory::~GPUDynamicFBMemory
virtual ~GPUDynamicFBMemory(void)

Realm::Hip::GPUDynamicFBMemory::generate_resource_info
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)

Realm::Hip::GPUDynamicFBMemory::put_bytes
virtual void put_bytes(off_t offset, const void *src, size_t size)

Realm::Hip::GPUDynamicFBMemory::get_direct_ptr
virtual void * get_direct_ptr(off_t offset, size_t size)

Realm::Hip::GPUDynamicFBMemory::attempt_register_external_resource
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)

Realm::Hip::GPUDynamicFBMemory::GPUDynamicFBMemory
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)

Realm::Hip::GPUDynamicFBMemory::release_storage_immediate
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)

Realm::Hip::GPUEventPool
Definition hip_internal.h:266

Realm::Hip::GPUEventPool::empty_pool
void empty_pool(void)

Realm::Hip::GPUEventPool::external_count
int external_count
Definition hip_internal.h:281

Realm::Hip::GPUEventPool::init_pool
void init_pool(int init_size=0)

Realm::Hip::GPUEventPool::batch_size
int batch_size
Definition hip_internal.h:281

Realm::Hip::GPUEventPool::current_size
int current_size
Definition hip_internal.h:281

Realm::Hip::GPUEventPool::GPUEventPool
GPUEventPool(int _batch_size=256)

Realm::Hip::GPUEventPool::total_size
int total_size
Definition hip_internal.h:281

Realm::Hip::GPUEventPool::get_event
hipEvent_t get_event(bool external=false)

Realm::Hip::GPUEventPool::mutex
Mutex mutex
Definition hip_internal.h:280

Realm::Hip::GPUEventPool::available_events
std::vector< hipEvent_t > available_events
Definition hip_internal.h:282

Realm::Hip::GPUEventPool::return_event
void return_event(hipEvent_t e, bool external=false)

Realm::Hip::GPUFBIBMemory
Definition hip_internal.h:643

Realm::Hip::GPUFBIBMemory::base
char * base
Definition hip_internal.h:650

Realm::Hip::GPUFBIBMemory::gpu
GPU * gpu
Definition hip_internal.h:649

Realm::Hip::GPUFBIBMemory::GPUFBIBMemory
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base, size_t _size)

Realm::Hip::GPUFBIBMemory::local_segment
NetworkSegment local_segment
Definition hip_internal.h:651

Realm::Hip::GPUFBMemory
Definition hip_internal.h:539

Realm::Hip::GPUFBMemory::unregister_external_resource
virtual void unregister_external_resource(RegionInstanceImpl *inst)

Realm::Hip::GPUFBMemory::put_bytes
virtual void put_bytes(off_t offset, const void *src, size_t size)

Realm::Hip::GPUFBMemory::generate_resource_info
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)

Realm::Hip::GPUFBMemory::get_bytes
virtual void get_bytes(off_t offset, void *dst, size_t size)

Realm::Hip::GPUFBMemory::local_segment
NetworkSegment local_segment
Definition hip_internal.h:567

Realm::Hip::GPUFBMemory::attempt_register_external_resource
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)

Realm::Hip::GPUFBMemory::get_direct_ptr
virtual void * get_direct_ptr(off_t offset, size_t size)

Realm::Hip::GPUFBMemory::base
char * base
Definition hip_internal.h:566

Realm::Hip::GPUFBMemory::~GPUFBMemory
virtual ~GPUFBMemory(void)

Realm::Hip::GPUFBMemory::gpu
GPU * gpu
Definition hip_internal.h:565

Realm::Hip::GPUFBMemory::GPUFBMemory
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base, size_t _size)

Realm::Hip::GPUPreemptionWaiter
Definition hip_internal.h:113

Realm::Hip::GPUPreemptionWaiter::GPUPreemptionWaiter
GPUPreemptionWaiter(GPU *gpu)

Realm::Hip::GPUPreemptionWaiter::~GPUPreemptionWaiter
virtual ~GPUPreemptionWaiter(void)
Definition hip_internal.h:116

Realm::Hip::GPUPreemptionWaiter::request_completed
virtual void request_completed(void)

Realm::Hip::GPUPreemptionWaiter::preempt
void preempt(void)

Realm::Hip::GPUProcessor
Definition hip_internal.h:427

Realm::Hip::GPUProcessor::ctxsync
ContextSynchronizer ctxsync
Definition hip_internal.h:510

Realm::Hip::GPUProcessor::gpu_memcpy_async
void gpu_memcpy_async(void *dst, const void *src, size_t size, hipMemcpyKind kind, hipStream_t stream)

Realm::Hip::GPUProcessor::gpu_memset
void gpu_memset(void *dst, int value, size_t count)

Realm::Hip::GPUProcessor::launch_configs
std::vector< CallConfig > launch_configs
Definition hip_internal.h:506

Realm::Hip::GPUProcessor::shutdown
virtual void shutdown(void)

Realm::Hip::GPUProcessor::gpu_memset_async
void gpu_memset_async(void *dst, int value, size_t count, hipStream_t stream)

Realm::Hip::GPUProcessor::register_task
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)

Realm::Hip::GPUProcessor::GPUProcessor
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)

Realm::Hip::GPUProcessor::gpu_memcpy
void gpu_memcpy(void *dst, const void *src, size_t size, hipMemcpyKind kind)

Realm::Hip::GPUProcessor::call_configs
std::vector< CallConfig > call_configs
Definition hip_internal.h:508

Realm::Hip::GPUProcessor::get_current_gpu_proc
static GPUProcessor * get_current_gpu_proc(void)

Realm::Hip::GPUProcessor::kernel_args
std::vector< char > kernel_args
Definition hip_internal.h:507

Realm::Hip::GPUProcessor::~GPUProcessor
virtual ~GPUProcessor(void)

Realm::Hip::GPUProcessor::stream_synchronize
void stream_synchronize(hipStream_t stream)

Realm::Hip::GPUProcessor::execute_task
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)

Realm::Hip::GPUProcessor::block_on_synchronize
bool block_on_synchronize
Definition hip_internal.h:509

Realm::Hip::GPUProcessor::stream_wait_on_event
void stream_wait_on_event(hipStream_t stream, hipEvent_t event)

Realm::Hip::GPUProcessor::gpu
GPU * gpu
Definition hip_internal.h:493

Realm::Hip::GPUProcessor::device_synchronize
void device_synchronize(void)

Realm::Hip::GPUProcessor::gpu_task_table
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition hip_internal.h:523

Realm::Hip::GPUProcessor::core_rsrv
Realm::CoreReservation * core_rsrv
Definition hip_internal.h:513

Realm::Hip::GPUReplHeapListener
Definition hip_internal.h:869

Realm::Hip::GPUReplHeapListener::chunk_destroyed
virtual void chunk_destroyed(void *base, size_t bytes)

Realm::Hip::GPUReplHeapListener::chunk_created
virtual void chunk_created(void *base, size_t bytes)

Realm::Hip::GPUReplHeapListener::GPUReplHeapListener
GPUReplHeapListener(HipModule *_module)

Realm::Hip::GPURequest
Definition hip_internal.h:663

Realm::Hip::GPURequest::dst_base
void * dst_base
Definition hip_internal.h:666

Realm::Hip::GPURequest::src_base
const void * src_base
Definition hip_internal.h:665

Realm::Hip::GPURequest::event
GPUCompletionEvent event
Definition hip_internal.h:669

Realm::Hip::GPURequest::dst_gpu
GPU * dst_gpu
Definition hip_internal.h:668

Realm::Hip::GPUStream
Definition hip_internal.h:171

Realm::Hip::GPUStream::add_fence
void add_fence(GPUWorkFence *fence)

Realm::Hip::GPUStream::add_start_event
void add_start_event(GPUWorkStart *start)

Realm::Hip::GPUStream::worker
GPUWorker * worker
Definition hip_internal.h:205

Realm::Hip::GPUStream::pending_events
std::deque< PendingEvent > pending_events
Definition hip_internal.h:220

Realm::Hip::GPUStream::add_event
void add_event(hipEvent_t event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)

Realm::Hip::GPUStream::has_work
bool has_work(void) const

Realm::Hip::GPUStream::gpu
GPU * gpu
Definition hip_internal.h:204

Realm::Hip::GPUStream::add_notification
void add_notification(GPUCompletionNotification *notification)

Realm::Hip::GPUStream::reap_events
bool reap_events(TimeLimit work_until)

Realm::Hip::GPUStream::stream
hipStream_t stream
Definition hip_internal.h:207

Realm::Hip::GPUStream::mutex
Mutex mutex
Definition hip_internal.h:209

Realm::Hip::GPUStream::wait_on_streams
void wait_on_streams(const std::set< GPUStream * > &other_streams)

Realm::Hip::GPUStream::GPUStream
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)

Realm::Hip::GPUStream::get_stream
REALM_INTERNAL_API_EXTERNAL_LINKAGE hipStream_t get_stream(void) const

Realm::Hip::GPUStream::ok_to_submit_copy
bool ok_to_submit_copy(size_t bytes, XferDes *xd)

Realm::Hip::GPUStream::~GPUStream
~GPUStream(void)

Realm::Hip::GPUStream::get_gpu
GPU * get_gpu(void) const

Realm::Hip::GPUTransferCompletion
Definition hip_internal.h:672

Realm::Hip::GPUTransferCompletion::write_port_idx
int write_port_idx
Definition hip_internal.h:684

Realm::Hip::GPUTransferCompletion::read_offset
size_t read_offset
Definition hip_internal.h:683

Realm::Hip::GPUTransferCompletion::write_offset
size_t write_offset
Definition hip_internal.h:685

Realm::Hip::GPUTransferCompletion::xd
XferDes * xd
Definition hip_internal.h:681

Realm::Hip::GPUTransferCompletion::request_completed
virtual void request_completed(void)

Realm::Hip::GPUTransferCompletion::read_port_idx
int read_port_idx
Definition hip_internal.h:682

Realm::Hip::GPUTransferCompletion::write_size
size_t write_size
Definition hip_internal.h:685

Realm::Hip::GPUTransferCompletion::read_size
size_t read_size
Definition hip_internal.h:683

Realm::Hip::GPUTransferCompletion::GPUTransferCompletion
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)

Realm::Hip::GPUWorkFence
Definition hip_internal.h:129

Realm::Hip::GPUWorkFence::request_cancellation
virtual void request_cancellation(void)

Realm::Hip::GPUWorkFence::print
virtual void print(std::ostream &os) const

Realm::Hip::GPUWorkFence::cuda_callback
static void cuda_callback(hipStream_t stream, hipError_t res, void *data)

Realm::Hip::GPUWorkFence::fence_list_link
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition hip_internal.h:141

Realm::Hip::GPUWorkFence::GPUWorkFence
GPUWorkFence(Realm::Operation *op)

Realm::Hip::GPUWorkFence::mark_finished
virtual void mark_finished(bool successful)

Realm::Hip::GPUWorkFence::FenceList
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition hip_internal.h:145

Realm::Hip::GPUWorkFence::enqueue_on_stream
void enqueue_on_stream(GPUStream *stream)

Realm::Hip::GPUWorkFence::REALM_PMTA_DEFN
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)

Realm::Hip::GPUWorkStart
Definition hip_internal.h:151

Realm::Hip::GPUWorkStart::mark_gpu_work_start
void mark_gpu_work_start()

Realm::Hip::GPUWorkStart::request_cancellation
virtual void request_cancellation(void)
Definition hip_internal.h:155

Realm::Hip::GPUWorkStart::enqueue_on_stream
void enqueue_on_stream(GPUStream *stream)

Realm::Hip::GPUWorkStart::cuda_start_callback
static void cuda_start_callback(hipStream_t stream, hipError_t res, void *data)

Realm::Hip::GPUWorkStart::GPUWorkStart
GPUWorkStart(Realm::Operation *op)

Realm::Hip::GPUWorkStart::print
virtual void print(std::ostream &os) const

Realm::Hip::GPUWorker
Definition hip_internal.h:227

Realm::Hip::GPUWorker::GPUWorker
GPUWorker(void)

Realm::Hip::GPUWorker::core_rsrv
Realm::CoreReservation * core_rsrv
Definition hip_internal.h:258

Realm::Hip::GPUWorker::ActiveStreamQueue
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition hip_internal.h:254

Realm::Hip::GPUWorker::worker_shutdown_requested
atomic< bool > worker_shutdown_requested
Definition hip_internal.h:261

Realm::Hip::GPUWorker::thread_sleeping
bool thread_sleeping
Definition hip_internal.h:260

Realm::Hip::GPUWorker::thread_main
void thread_main(void)

Realm::Hip::GPUWorker::start_background_thread
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)

Realm::Hip::GPUWorker::active_streams
ActiveStreamQueue active_streams
Definition hip_internal.h:255

Realm::Hip::GPUWorker::do_work
bool do_work(TimeLimit work_until)

Realm::Hip::GPUWorker::~GPUWorker
virtual ~GPUWorker(void)

Realm::Hip::GPUWorker::condvar
Mutex::CondVar condvar
Definition hip_internal.h:252

Realm::Hip::GPUWorker::lock
Mutex lock
Definition hip_internal.h:251

Realm::Hip::GPUWorker::shutdown_background_thread
void shutdown_background_thread(void)

Realm::Hip::GPUWorker::worker_thread
Realm::Thread * worker_thread
Definition hip_internal.h:259

Realm::Hip::GPUWorker::add_stream
void add_stream(GPUStream *s)

Realm::Hip::GPUWorker::process_streams
bool process_streams(bool sleep_on_empty)

Realm::Hip::GPUXferDes
Definition hip_internal.h:690

Realm::Hip::GPUXferDes::GPUXferDes
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)

Realm::Hip::GPUXferDes::progress_xd
bool progress_xd(GPUChannel *channel, TimeLimit work_until)

Realm::Hip::GPUXferDes::get_requests
long get_requests(Request **requests, long nr)

Realm::Hip::GPUZCMemory
Definition hip_internal.h:612

Realm::Hip::GPUZCMemory::attempt_register_external_resource
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)

Realm::Hip::GPUZCMemory::cpu_base
char * cpu_base
Definition hip_internal.h:639

Realm::Hip::GPUZCMemory::put_bytes
virtual void put_bytes(off_t offset, const void *src, size_t size)

Realm::Hip::GPUZCMemory::gpu_base
char * gpu_base
Definition hip_internal.h:638

Realm::Hip::GPUZCMemory::GPUZCMemory
GPUZCMemory(RuntimeImpl *_runtime_impl, Memory _me, char *_gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)

Realm::Hip::GPUZCMemory::local_segment
NetworkSegment local_segment
Definition hip_internal.h:640

Realm::Hip::GPUZCMemory::generate_resource_info
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)

Realm::Hip::GPUZCMemory::get_direct_ptr
virtual void * get_direct_ptr(off_t offset, size_t size)

Realm::Hip::GPUZCMemory::unregister_external_resource
virtual void unregister_external_resource(RegionInstanceImpl *inst)

Realm::Hip::GPUZCMemory::~GPUZCMemory
virtual ~GPUZCMemory(void)

Realm::Hip::GPUZCMemory::get_bytes
virtual void get_bytes(off_t offset, void *dst, size_t size)

Realm::Hip::GPU
Definition hip_internal.h:322

Realm::Hip::GPU::device_id
int device_id
Definition hip_internal.h:366

Realm::Hip::GPU::find_stream
GPUStream * find_stream(hipStream_t stream) const

Realm::Hip::GPU::hipipc_mappings
std::vector< HipIpcMapping > hipipc_mappings
Definition hip_internal.h:403

Realm::Hip::GPU::fb_dmem
GPUDynamicFBMemory * fb_dmem
Definition hip_internal.h:362

Realm::Hip::GPU::proc
GPUProcessor * proc
Definition hip_internal.h:360

Realm::Hip::GPU::fb_ibmem_base
char * fb_ibmem_base
Definition hip_internal.h:370

Realm::Hip::GPU::pinned_sysmems
std::set< Memory > pinned_sysmems
Definition hip_internal.h:373

Realm::Hip::GPU::load_hip_module
hipModule_t load_hip_module(const void *data)

Realm::Hip::GPU::find_ipc_mapping
const HipIpcMapping * find_ipc_mapping(Memory mem) const

Realm::Hip::GPU::next_d2d_stream
atomic< unsigned > next_d2d_stream
Definition hip_internal.h:389

Realm::Hip::GPU::info
GPUInfo * info
Definition hip_internal.h:358

Realm::Hip::GPU::device_to_device_stream
GPUStream * device_to_device_stream
Definition hip_internal.h:384

Realm::Hip::GPU::greatest_stream_priority
int greatest_stream_priority
Definition hip_internal.h:395

Realm::Hip::GPU::next_task_stream
atomic< unsigned > next_task_stream
Definition hip_internal.h:388

Realm::Hip::GPU::create_dma_channels
void create_dma_channels(Realm::RuntimeImpl *r)

Realm::Hip::GPU::fbmem_base
char * fbmem_base
Definition hip_internal.h:368

Realm::Hip::GPU::peer_to_peer_streams
std::vector< GPUStream * > peer_to_peer_streams
Definition hip_internal.h:386

Realm::Hip::GPU::pop_context
void pop_context(void)

Realm::Hip::GPU::peer_fbs
std::set< Memory > peer_fbs
Definition hip_internal.h:379

Realm::Hip::GPU::least_stream_priority
int least_stream_priority
Definition hip_internal.h:395

Realm::Hip::GPU::create_fb_memory
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)

Realm::Hip::GPU::worker
GPUWorker * worker
Definition hip_internal.h:359

Realm::Hip::GPU::get_null_task_stream
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const

Realm::Hip::GPU::push_context
void push_context(void)

Realm::Hip::GPU::device_to_device_streams
std::vector< GPUStream * > device_to_device_streams
Definition hip_internal.h:385

Realm::Hip::GPU::fbmem
GPUFBMemory * fbmem
Definition hip_internal.h:361

Realm::Hip::GPU::can_access_peer
bool can_access_peer(GPU *peer)

Realm::Hip::GPU::task_streams
std::vector< GPUStream * > task_streams
Definition hip_internal.h:387

Realm::Hip::GPU::event_pool
GPUEventPool event_pool
Definition hip_internal.h:391

Realm::Hip::GPU::GPU
GPU(HipModule *_module, GPUInfo *_info, GPUWorker *worker, int _device_id)

Realm::Hip::GPU::~GPU
~GPU(void)

Realm::Hip::GPU::device_to_host_stream
GPUStream * device_to_host_stream
Definition hip_internal.h:383

Realm::Hip::GPU::host_to_device_stream
GPUStream * host_to_device_stream
Definition hip_internal.h:382

Realm::Hip::GPU::create_dynamic_fb_memory
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)

Realm::Hip::GPU::get_next_d2d_stream
GPUStream * get_next_d2d_stream()

Realm::Hip::GPU::create_processor
void create_processor(RuntimeImpl *runtime, size_t stack_size)

Realm::Hip::GPU::hipipc_streams
std::map< NodeID, GPUStream * > hipipc_streams
Definition hip_internal.h:404

Realm::Hip::GPU::fb_ibmem
GPUFBIBMemory * fb_ibmem
Definition hip_internal.h:363

Realm::Hip::GPU::get_next_task_stream
GPUStream * get_next_task_stream(bool create=false)

Realm::Hip::GPU::managed_mems
std::set< Memory > managed_mems
Definition hip_internal.h:376

Realm::Hip::GPUfillChannel
Definition hip_internal.h:746

Realm::Hip::GPUfillChannel::gpu
GPU * gpu
Definition hip_internal.h:766

Realm::Hip::GPUfillChannel::create_xfer_des
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)

Realm::Hip::GPUfillChannel::submit
long submit(Request **requests, long nr)

Realm::Hip::GPUfillChannel::GPUfillChannel
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)

Realm::Hip::GPUfillChannel::is_ordered
static const bool is_ordered
Definition hip_internal.h:751

Realm::Hip::GPUfillXferDes
Definition hip_internal.h:731

Realm::Hip::GPUfillXferDes::GPUfillXferDes
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)

Realm::Hip::GPUfillXferDes::progress_xd
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)

Realm::Hip::GPUfillXferDes::reduced_fill_size
size_t reduced_fill_size
Definition hip_internal.h:743

Realm::Hip::GPUfillXferDes::get_requests
long get_requests(Request **requests, long nr)

Realm::Hip::GPUreduceChannel
Definition hip_internal.h:793

Realm::Hip::GPUreduceChannel::submit
long submit(Request **requests, long nr)

Realm::Hip::GPUreduceChannel::create_xfer_des
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)

Realm::Hip::GPUreduceChannel::GPUreduceChannel
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)

Realm::Hip::GPUreduceChannel::is_ordered
static const bool is_ordered
Definition hip_internal.h:798

Realm::Hip::GPUreduceChannel::construct_remote_info
virtual RemoteChannelInfo * construct_remote_info() const

Realm::Hip::GPUreduceChannel::gpu
GPU * gpu
Definition hip_internal.h:817

Realm::Hip::GPUreduceChannel::supports_redop
virtual bool supports_redop(ReductionOpID redop_id) const

Realm::Hip::GPUreduceRemoteChannelInfo
Definition hip_internal.h:820

Realm::Hip::GPUreduceRemoteChannelInfo::serdez_subclass
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition hip_internal.h:836

Realm::Hip::GPUreduceRemoteChannelInfo::deserialize_new
static RemoteChannelInfo * deserialize_new(S &deserializer)

Realm::Hip::GPUreduceRemoteChannelInfo::create_remote_channel
virtual RemoteChannel * create_remote_channel()

Realm::Hip::GPUreduceRemoteChannelInfo::GPUreduceRemoteChannelInfo
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)

Realm::Hip::GPUreduceRemoteChannelInfo::serialize
bool serialize(S &serializer) const

Realm::Hip::GPUreduceRemoteChannel
Definition hip_internal.h:839

Realm::Hip::GPUreduceXferDes
Definition hip_internal.h:771

Realm::Hip::GPUreduceXferDes::progress_xd
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)

Realm::Hip::GPUreduceXferDes::kernel_host_proxy
const void * kernel_host_proxy
Definition hip_internal.h:788

Realm::Hip::GPUreduceXferDes::get_requests
long get_requests(Request **requests, long nr)

Realm::Hip::GPUreduceXferDes::GPUreduceXferDes
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)

Realm::Hip::GPUreduceXferDes::redop_info
XferDesRedopInfo redop_info
Definition hip_internal.h:783

Realm::Hip::GPUreduceXferDes::stream
GPUStream * stream
Definition hip_internal.h:790

Realm::Hip::GPUreduceXferDes::redop
const ReductionOpUntyped * redop
Definition hip_internal.h:784

Realm::Hip::HipDeviceMemoryInfo
Definition hip_internal.h:531

Realm::Hip::HipDeviceMemoryInfo::HipDeviceMemoryInfo
HipDeviceMemoryInfo(int _device_id)

Realm::Hip::HipDeviceMemoryInfo::gpu
GPU * gpu
Definition hip_internal.h:536

Realm::Hip::HipDeviceMemoryInfo::device_id
int device_id
Definition hip_internal.h:535

Realm::Hip::HipModule
Definition hip_module.h:142

Realm::IBMemory
Definition ib_memory.h:30

Realm::IndexSpaceGeneric
Definition indexspace.h:1115

Realm::IntrusiveList
Definition lists.h:66

Realm::LocalManagedMemory
Definition mem_impl.h:344

Realm::LocalTaskProcessor
Definition proc_impl.h:141

Realm::MemoryImpl
Definition mem_impl.h:50

Realm::MemoryImpl::MemoryKind
MemoryKind
Definition mem_impl.h:53

Realm::MemoryImpl::size
size_t size
Definition mem_impl.h:195

Realm::MemoryImpl::AllocationResult
AllocationResult
Definition mem_impl.h:89

Realm::Memory
Definition memory.h:33

Realm::Memory::Kind
Kind
Definition memory.h:59

Realm::ModuleSpecificInfo
Definition module.h:100

Realm::NetworkSegment
Definition network.h:409

Realm::Operation::AsyncWorkItem
Definition operation.h:75

Realm::Operation::AsyncWorkItem::op
Operation * op
Definition operation.h:87

Realm::Operation
Definition operation.h:32

Realm::Processor
Definition processor.h:37

Realm::Processor::TaskFuncID
::realm_task_func_id_t TaskFuncID
Definition processor.h:58

Realm::RegionInstanceImpl
Definition inst_impl.h:54

Realm::RemoteChannelInfo
Definition channel.h:895

Realm::RemoteChannel
Definition channel.h:938

Realm::ReplicatedHeap::Listener
Definition repl_heap.h:50

Realm::Request
Definition channel.h:102

Realm::RuntimeImpl
Definition runtime_impl.h:267

Realm::Serialization::PolymorphicSerdezSubclass
Definition serialize.h:363

Realm::SimpleRemoteChannelInfo
Definition channel.h:908

Realm::SingleXDQChannel
Definition channel.h:1018

Realm::Thread
Definition threads.h:89

Realm::TimeLimit
Definition timers.h:129

Realm::UnfairCondVar
Definition mutex.h:325

Realm::UnfairMutex
Definition mutex.h:223

Realm::XferDes
Definition channel.h:285

Realm::XferDes::channel
Channel * channel
Definition channel.h:342

Realm::atomic
Definition atomics.h:31

Realm::span
Definition utils.h:84

REALM_INTERNAL_API_EXTERNAL_LINKAGE
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218

hip_module.h

ib_memory.h

indexspace.h

REALM_PMTA_USE
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42

mem_impl.h

Realm::Hip::GPUMemcpyKind
GPUMemcpyKind
Definition hip_internal.h:84

Realm::Hip::GPU_MEMCPY_HOST_TO_DEVICE
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition hip_internal.h:85

Realm::Hip::GPU_MEMCPY_PEER_TO_PEER
@ GPU_MEMCPY_PEER_TO_PEER
Definition hip_internal.h:88

Realm::Hip::GPU_MEMCPY_DEVICE_TO_HOST
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition hip_internal.h:86

Realm::Hip::GPU_MEMCPY_DEVICE_TO_DEVICE
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition hip_internal.h:87

Realm::Hip::hip_module_singleton
HipModule * hip_module_singleton

Realm
Definition activemsg.h:42

Realm::NodeID
int NodeID
Definition nodeset.h:40

Realm::XferDesKind
XferDesKind
Definition channel.h:84

Realm::XferDesID
unsigned long long XferDesID
Definition channel.h:56

Realm::ReductionOpID
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38

operation.h

proc_impl.h

Realm::Hip::FatBin
Definition hip_hijack.h:39

Realm::Hip::GPUInfo
Definition hip_internal.h:69

Realm::Hip::GPUInfo::totalGlobalMem
size_t totalGlobalMem
Definition hip_internal.h:78

Realm::Hip::GPUInfo::major
int major
Definition hip_internal.h:77

Realm::Hip::GPUInfo::index
int index
Definition hip_internal.h:70

Realm::Hip::GPUInfo::peers
std::set< hipDevice_t > peers
Definition hip_internal.h:80

Realm::Hip::GPUInfo::MAX_NAME_LEN
static const size_t MAX_NAME_LEN
Definition hip_internal.h:73

Realm::Hip::GPUInfo::name
char name[MAX_NAME_LEN]
Definition hip_internal.h:75

Realm::Hip::GPUInfo::minor
int minor
Definition hip_internal.h:77

Realm::Hip::GPUInfo::device
hipDevice_t device
Definition hip_internal.h:71

Realm::Hip::GPUProcessor::CallConfig
Definition hip_internal.h:502

Realm::Hip::GPUProcessor::CallConfig::CallConfig
CallConfig(dim3 _grid, dim3 _block, size_t _shared, hipStream_t _stream)

Realm::Hip::GPUProcessor::CallConfig::stream
hipStream_t stream
Definition hip_internal.h:503

Realm::Hip::GPUProcessor::GPUTaskTableEntry
Definition hip_internal.h:515

Realm::Hip::GPUProcessor::GPUTaskTableEntry::stream_aware_fnptr
Hip::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition hip_internal.h:517

Realm::Hip::GPUProcessor::GPUTaskTableEntry::user_data
ByteArray user_data
Definition hip_internal.h:518

Realm::Hip::GPUProcessor::GPUTaskTableEntry::fnptr
Processor::TaskFuncPtr fnptr
Definition hip_internal.h:516

Realm::Hip::GPUProcessor::LaunchConfig
Definition hip_internal.h:496

Realm::Hip::GPUProcessor::LaunchConfig::shared
size_t shared
Definition hip_internal.h:499

Realm::Hip::GPUProcessor::LaunchConfig::grid
dim3 grid
Definition hip_internal.h:497

Realm::Hip::GPUProcessor::LaunchConfig::block
dim3 block
Definition hip_internal.h:498

Realm::Hip::GPUProcessor::LaunchConfig::LaunchConfig
LaunchConfig(dim3 _grid, dim3 _block, size_t _shared)

Realm::Hip::GPUStream::PendingEvent
Definition hip_internal.h:211

Realm::Hip::GPUStream::PendingEvent::notification
GPUCompletionNotification * notification
Definition hip_internal.h:215

Realm::Hip::GPUStream::PendingEvent::fence
GPUWorkFence * fence
Definition hip_internal.h:213

Realm::Hip::GPUStream::PendingEvent::start
GPUWorkStart * start
Definition hip_internal.h:214

Realm::Hip::GPUStream::PendingEvent::event
hipEvent_t event
Definition hip_internal.h:212

Realm::Hip::GPU::HipIpcMapping
Definition hip_internal.h:397

Realm::Hip::GPU::HipIpcMapping::owner
NodeID owner
Definition hip_internal.h:398

Realm::Hip::GPU::HipIpcMapping::local_base
uintptr_t local_base
Definition hip_internal.h:400

Realm::Hip::GPU::HipIpcMapping::mem
Memory mem
Definition hip_internal.h:399

Realm::Hip::GPU::HipIpcMapping::address_offset
uintptr_t address_offset
Definition hip_internal.h:401

Realm::Hip::HipIpcRelease
Definition hip_internal.h:863

Realm::Hip::HipIpcRelease::handle_message
static void handle_message(NodeID sender, const HipIpcRelease &args, const void *data, size_t datalen)

Realm::Hip::HipIpcRequest
Definition hip_internal.h:847

Realm::Hip::HipIpcRequest::handle_message
static void handle_message(NodeID sender, const HipIpcRequest &args, const void *data, size_t datalen)

Realm::Hip::HipIpcResponse
Definition hip_internal.h:856

Realm::Hip::HipIpcResponse::count
unsigned count
Definition hip_internal.h:857

Realm::Hip::HipIpcResponse::handle_message
static void handle_message(NodeID sender, const HipIpcResponse &args, const void *data, size_t datalen)

Realm::Hip::RegisteredFunction
Definition hip_hijack.h:46

Realm::Hip::RegisteredVariable
Definition hip_hijack.h:55

Realm::IntrusiveListLink
Definition lists.h:53

Realm::ReductionOpUntyped
Definition redop.h:56

Realm::XferDesRedopInfo
Definition channel.h:209

threads.h

src
NodeID src
Definition ucp_internal.h:1