Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
hip_internal.h
Go to the documentation of this file.
1/*
2 * Copyright 2026 Stanford University, NVIDIA Corporation, Los Alamos National Laboratory
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_HIP_INTERNAL_H
19#define REALM_HIP_INTERNAL_H
20
22
23#include <hip/hip_runtime.h>
24
25#include "realm/operation.h"
26#include "realm/threads.h"
27#include "realm/circ_queue.h"
28#include "realm/indexspace.h"
29#include "realm/proc_impl.h"
30#include "realm/mem_impl.h"
31#include "realm/bgwork.h"
34
// Runs a HIP runtime call and terminates the process if it fails.
//
// `cmd` is evaluated exactly once. When the result differs from hipSuccess,
// the stringified expression, the numeric error code and the text returned
// by hipGetErrorString() are written to stderr; the macro then hits
// assert(0) and finally calls exit(1), which still runs when assertions are
// compiled out.
//
// The result is stored under a deliberately distinctive local name: with a
// plain `ret`, an invocation such as CHECK_CUDART(f(ret)) would read the
// macro's own uninitialised local instead of the caller's variable.
#define CHECK_CUDART(cmd)                                                  \
  do {                                                                     \
    const hipError_t realm_hip_check_ret_ = (cmd);                         \
    if(realm_hip_check_ret_ != hipSuccess) {                               \
      fprintf(stderr, "HIP: %s = %d (%s)\n", #cmd,                         \
              static_cast<int>(realm_hip_check_ret_),                      \
              hipGetErrorString(realm_hip_check_ret_));                    \
      assert(0);                                                           \
      exit(1);                                                             \
    }                                                                      \
  } while(0)
44
// Prints a diagnostic for a failed HIP call and aborts the process.
//
//   cmd - C string describing the call that failed (typically #cmd)
//   ret - the hipError_t that was returned
//
// The output line has the form "HIP: <cmd> = <code> (<name>): <string>",
// using hipGetErrorName() and hipGetErrorString() for the last two fields.
//
// `ret` is copied into a local first so that an argument with side effects
// is evaluated only once, and the locals carry distinctive names so they
// cannot capture identifiers that appear in the caller's arguments.
#define REPORT_HIP_ERROR(cmd, ret)                                         \
  do {                                                                     \
    const hipError_t realm_hip_report_err_ = (ret);                        \
    const char *const realm_hip_report_name_ =                             \
        hipGetErrorName(realm_hip_report_err_);                            \
    const char *const realm_hip_report_str_ =                              \
        hipGetErrorString(realm_hip_report_err_);                          \
    fprintf(stderr, "HIP: %s = %d (%s): %s\n", (cmd),                      \
            static_cast<int>(realm_hip_report_err_),                       \
            realm_hip_report_name_, realm_hip_report_str_);                \
    abort();                                                               \
  } while(0)
53
// Runs a HIP call and hands any failure to REPORT_HIP_ERROR.
//
// `cmd` is evaluated exactly once; if the result is not hipSuccess, the
// stringified expression and the error code are passed to
// REPORT_HIP_ERROR.
//
// The local uses a distinctive name so that a `cmd` mentioning a caller
// variable called `ret` is not shadowed, and the `if` is braced so the
// nested macro expansion cannot pair with a following `else`.
#define CHECK_HIP(cmd)                                                     \
  do {                                                                     \
    const hipError_t realm_hip_check_ret_ = (cmd);                         \
    if(realm_hip_check_ret_ != hipSuccess) {                               \
      REPORT_HIP_ERROR(#cmd, realm_hip_check_ret_);                        \
    }                                                                      \
  } while(0)
60
61namespace Realm {
62
63 namespace Hip {
64
// Per-device record: the index used by the HIP runtime, the hipDevice_t
// handle, and the set of peer devices that p2p copies can target. When
// REALM_USE_HIP_HIJACK is defined the struct also derives from
// hipDeviceProp_t.
// NOTE(review): listing lines 75 and 77-78 (the body of the #ifndef below)
// are absent from this rendering - consult the original header for them.
65 struct GPUInfo
66#ifdef REALM_USE_HIP_HIJACK
67 : public hipDeviceProp_t
68#endif
69 {
70 int index; // index used by HIP runtime
71 hipDevice_t device;
72
73 static const size_t MAX_NAME_LEN = 64;
74#ifndef REALM_USE_HIP_HIJACK
// (members declared only when the hijack is disabled - not shown here)
76
79#endif
80 std::set<hipDevice_t> peers; // other GPUs we can do p2p copies with
81 };
82
90
91 // Forward declarations
92 class GPUProcessor;
93 class GPUWorker;
94 class GPUStream;
95 class GPUFBMemory;
96 class GPUDynamicFBMemory;
97 class GPUZCMemory;
98 class GPUFBIBMemory;
99 class GPU;
100 class HipModule;
101
102 extern HipModule *hip_module_singleton;
103
104 // an interface for receiving completion notification for a GPU operation
105 // (right now, just copies)
107 public:
109
110 virtual void request_completed(void) = 0;
111 };
112
114 public:
116 virtual ~GPUPreemptionWaiter(void) {}
117
118 public:
119 virtual void request_completed(void);
120
121 public:
122 void preempt(void);
123
124 private:
125 GPU *const gpu;
126 Event wait_event;
127 };
128
130 public:
132
133 virtual void mark_finished(bool successful);
134
135 virtual void request_cancellation(void);
136
138
139 virtual void print(std::ostream &os) const;
140
144 DummyLock>
146
147 protected:
148 static void cuda_callback(hipStream_t stream, hipError_t res, void *data);
149 };
150
152 public:
154
155 virtual void request_cancellation(void) { return; };
156
158
159 virtual void print(std::ostream &os) const;
160
162
163 protected:
164 static void cuda_start_callback(hipStream_t stream, hipError_t res, void *data);
165 };
166
167 // a class that represents a HIP stream and work associated with
168 // it (e.g. queued copies, events in flight)
169 // a stream is also associated with a GPUWorker that it will register
170 // with when async work needs doing
// NOTE(review): this rendering omits several listing lines of the class
// (174, 177, 181-183, 204-205, 209, 211-216, 218). The member index for
// GPUStream lists add_fence, add_start_event, add_notification, `GPU *gpu`,
// `GPUWorker *worker` and `Mutex mutex` among the missing declarations.
171 class GPUStream {
172 public:
// rel_priority defaults to 0
173 GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority = 0);
175
176 GPU *get_gpu(void) const;
// the return type sits on missing listing line 177; the member index gives
// it as `REALM_INTERNAL_API_EXTERNAL_LINKAGE hipStream_t`
178 get_stream(void) const; // needed by librealm_kokkos.so
179
180 // may be called by anybody to enqueue a copy or an event
// (the enqueue declarations this comment introduces are on the missing
// listing lines 181-183)
184 void wait_on_streams(const std::set<GPUStream *> &other_streams);
185
186 // atomically checks rate limit counters and returns true if 'bytes'
187 // worth of copies can be submitted or false if not (in which case
188 // the progress counter on the xd will be updated when it should try
189 // again)
190 bool ok_to_submit_copy(size_t bytes, XferDes *xd);
191
192 // to be called by a worker (that should already have the GPU context
193 // current) - returns true if any work remains
194 bool reap_events(TimeLimit work_until);
195
196 protected:
197 // may only be tested with lock held
198 bool has_work(void) const;
199
// fence is required; notification and start default to NULL
200 void add_event(hipEvent_t event, GPUWorkFence *fence,
201 GPUCompletionNotification *notification = NULL,
202 GPUWorkStart *start = NULL);
203
206
207 hipStream_t stream;
208
210
// the USE_CQ branch (listing line 218) is not shown; without USE_CQ the
// pending events are kept in a std::deque
217#ifdef USE_CQ
219#else
220 std::deque<PendingEvent> pending_events;
221#endif
222 };
223
224 // a GPUWorker is responsible for making progress on one or more GPUStreams -
225 // this may be done directly by a GPUProcessor or in a background thread
226 // spawned for the purpose
228 public:
230 virtual ~GPUWorker(void);
231
232 // adds a stream that has work to be done
234
235 // used to start a dedicated thread (mutually exclusive with being
236 // registered with a background work manager)
239
240 bool do_work(TimeLimit work_until);
241
242 public:
243 void thread_main(void);
244
245 protected:
246 // used by the background thread
247 // processes work on streams, optionally sleeping for work to show up
248 // returns true if work remains to be done
249 bool process_streams(bool sleep_on_empty);
250
253
256
257 // used by the background thread (if any)
262 };
263
264 // a little helper class to manage a pool of HIP events (hipEvent_t) that can be reused
265 // to reduce alloc/destroy overheads
267 public:
268 GPUEventPool(int _batch_size = 256);
269
270 // allocating the initial batch of events and cleaning up are done with
271 // these methods instead of constructor/destructor because we don't
272 // manage the GPU context in this helper class
273 void init_pool(int init_size = 0 /* default == batch size */);
274 void empty_pool(void);
275
276 hipEvent_t get_event(bool external = false);
277 void return_event(hipEvent_t e, bool external = false);
278
279 protected:
282 std::vector<hipEvent_t> available_events;
283 };
284
285 // when the runtime hijack is not enabled/active, a cuCtxSynchronize
286 // is required to ensure a task's completion event covers all of its
287 // actions - rather than blocking an important thread, we create a
288 // small thread pool to handle these
290 public:
291 ContextSynchronizer(GPU *_gpu, int _device_id, CoreReservationSet &crs,
292 int _max_threads);
294
296
298
300
301 protected:
303 // hipCtx_t context;
311 std::vector<Thread *> worker_threads;
313 };
314
315 struct FatBin;
316 struct RegisteredVariable;
317 struct RegisteredFunction;
318
319 // a GPU object represents our use of a given HIP-capable GPU - this will
320 // have an associated HIP context, a (possibly shared) worker thread, a
321 // processor, and an FB memory (the ZC memory is shared across all GPUs)
// NOTE(review): this rendering omits listing lines 343, 348, 351, 362-363,
// 382-384, 388-389, 391, 395, 397-399 and 406. Where the member index for
// this class names the missing declaration, it is pointed out inline below.
322 class GPU {
323 public:
324 GPU(HipModule *_module, GPUInfo *_info, GPUWorker *worker, int _device_id);
325 ~GPU(void);
326
327 void push_context(void);
328 void pop_context(void);
329
// registration/lookup entry points that exist only in hijack builds
330#ifdef REALM_USE_HIP_HIJACK
331 void register_fat_binary(const FatBin *data);
332 void register_variable(const RegisteredVariable *var);
333 void register_function(const RegisteredFunction *func);
334
335 hipFunction_t lookup_function(const void *func);
336 char *lookup_variable(const void *var);
337#endif
338
339 void create_processor(RuntimeImpl *runtime, size_t stack_size);
340 void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size);
341 void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size);
342
// missing listing line 343: the member index lists
// `void create_dma_channels(Realm::RuntimeImpl *r)` for this class
344
345 bool can_access_peer(GPU *peer);
346
347 GPUStream *find_stream(hipStream_t stream) const;
// the return type sits on missing listing line 348; the member index gives
// it as `REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream *`
349 get_null_task_stream(void) const; // needed by librealm_kokkos.so
350 GPUStream *get_next_task_stream(bool create = false);
// missing listing line 351: the member index lists
// `GPUStream *get_next_d2d_stream()` for this class
352
353 protected:
354 hipModule_t load_hip_module(const void *data);
355
356 public:
// pointer members below carry nullptr default member initializers
357 HipModule *module = nullptr;
358 GPUInfo *info = nullptr;
359 GPUWorker *worker = nullptr;
360 GPUProcessor *proc = nullptr;
361 GPUFBMemory *fbmem = nullptr;
// missing listing lines 362-363: the member index places
// `GPUDynamicFBMemory *fb_dmem` at 362
364
365 // hipCtx_t context;
366 int device_id = -1;
367
368 char *fbmem_base = nullptr;
369
370 char *fb_ibmem_base = nullptr;
371
372 // which system memories have been registered and can be used for cuMemcpyAsync
373 std::set<Memory> pinned_sysmems;
374
375 // managed memories we can concurrently access
376 std::set<Memory> managed_mems;
377
378 // which other FBs we have peer access to
379 std::set<Memory> peer_fbs;
380
381 // streams for different copy types and a pile for actual tasks
// missing listing lines 382-384: the member index places
// host_to_device_stream, device_to_host_stream and device_to_device_stream
// (all `GPUStream *`) there
385 std::vector<GPUStream *> device_to_device_streams;
386 std::vector<GPUStream *> peer_to_peer_streams; // indexed by target
387 std::vector<GPUStream *> task_streams;
// missing listing lines 388-389: `atomic<unsigned> next_task_stream` and
// `atomic<unsigned> next_d2d_stream` per the member index
390
// missing listing line 391: `GPUEventPool event_pool` per the member index
392
393 // this can technically be different in each context (but probably isn't
394 // in practice)
// missing listing line 395: `int least_stream_priority` and
// `int greatest_stream_priority` per the member index
396
// the opening of the nested struct these two fields belong to (listing lines
// 397-399) is not shown; the vector below names the type HipIpcMapping
400 uintptr_t local_base;
401 uintptr_t address_offset; // add to convert from original to local base
402 };
403 std::vector<HipIpcMapping> hipipc_mappings;
404 std::map<NodeID, GPUStream *> hipipc_streams;
405
// missing listing line 406; the member index lists
// `const HipIpcMapping *find_ipc_mapping(Memory mem) const` for this class
// without a line number - NOTE(review): confirm that is this line
407
// lookup tables that exist only in hijack builds
408#ifdef REALM_USE_HIP_HIJACK
409 std::map<const FatBin *, hipModule_t> device_modules;
410 std::map<const void *, hipFunction_t> device_functions;
411 std::map<const void *, char *> device_variables;
412#endif
413 };
414
415 // helper to push/pop a GPU's context by scope
417 public:
421
422 protected:
424 };
425
// NOTE(review): this rendering omits listing lines 427 (rest of the class
// head), 440, 444, 456, 493, 496, 509-510, 513, 515 and 518. Where the
// member index names the missing declaration, it is pointed out inline.
426 class REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUProcessor // needed by librealm_kokkos.so
428 public:
429 GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me,
430 Realm::CoreReservationSet &crs, size_t _stack_size);
431 virtual ~GPUProcessor(void);
432
433 public:
434 virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc,
435 const ByteArrayRef &user_data);
436
437 virtual void shutdown(void);
438
439 protected:
// first line of this declaration (listing line 440) is missing; the member
// index gives `virtual void execute_task(Processor::TaskFuncID func_id, ...`
441 const ByteArrayRef &task_args);
442
443 public:
// missing listing line 444: the member index lists
// `static GPUProcessor *get_current_gpu_proc(void)`
445
446#ifdef REALM_USE_HIP_HIJACK
447 // calls that come from the HIP runtime API
448 void push_call_configuration(dim3 grid_dim, dim3 block_dim, size_t shared_size,
449 void *stream);
450 void pop_call_configuration(dim3 *grid_dim, dim3 *block_dim, size_t *shared_size,
451 void *stream);
452#endif
453
454 void stream_wait_on_event(hipStream_t stream, hipEvent_t event);
455 void stream_synchronize(hipStream_t stream);
457
// event and kernel-launch entry points that exist only in hijack builds
458#ifdef REALM_USE_HIP_HIJACK
459 void event_create(hipEvent_t *event, int flags);
460 void event_destroy(hipEvent_t event);
461 void event_record(hipEvent_t event, hipStream_t stream);
462 void event_synchronize(hipEvent_t event);
463 void event_elapsed_time(float *ms, hipEvent_t start, hipEvent_t end);
464
465 void configure_call(dim3 grid_dim, dim3 block_dim, size_t shared_memory,
466 hipStream_t stream);
467 void setup_argument(const void *arg, size_t size, size_t offset);
468 void launch(const void *func);
469 void launch_kernel(const void *func, dim3 grid_dim, dim3 block_dim, void **args,
470 size_t shared_memory, hipStream_t stream);
471#endif
472
473 void gpu_memcpy(void *dst, const void *src, size_t size, hipMemcpyKind kind);
474 void gpu_memcpy_async(void *dst, const void *src, size_t size, hipMemcpyKind kind,
475 hipStream_t stream);
// symbol-based copies exist only in hijack builds
476#ifdef REALM_USE_HIP_HIJACK
477 void gpu_memcpy_to_symbol(const void *dst, const void *src, size_t size,
478 size_t offset, hipMemcpyKind kind);
479 void gpu_memcpy_to_symbol_async(const void *dst, const void *src, size_t size,
480 size_t offset, hipMemcpyKind kind,
481 hipStream_t stream);
482 void gpu_memcpy_from_symbol(void *dst, const void *src, size_t size, size_t offset,
483 hipMemcpyKind kind);
484 void gpu_memcpy_from_symbol_async(void *dst, const void *src, size_t size,
485 size_t offset, hipMemcpyKind kind,
486 hipStream_t stream);
487#endif
488
489 void gpu_memset(void *dst, int value, size_t count);
490 void gpu_memset_async(void *dst, int value, size_t count, hipStream_t stream);
491
492 public:
// missing listing line 493: `GPU *gpu` per the member index
494
495 // data needed for kernel launches
// the opening of the struct holding grid/block/shared (listing line 496) is
// not shown; its constructor below names it LaunchConfig
497 dim3 grid;
498 dim3 block;
499 size_t shared;
500 LaunchConfig(dim3 _grid, dim3 _block, size_t _shared);
501 };
// CallConfig adds the stream to a LaunchConfig
502 struct CallConfig : public LaunchConfig {
503 hipStream_t stream;
504 CallConfig(dim3 _grid, dim3 _block, size_t _shared, hipStream_t _stream);
505 };
506 std::vector<CallConfig> launch_configs;
507 std::vector<char> kernel_args;
508 std::vector<CallConfig> call_configs;
// missing listing lines 509-510: `bool block_on_synchronize` and
// `ContextSynchronizer ctxsync` per the member index
511
512 protected:
// missing listing line 513: `Realm::CoreReservation *core_rsrv` per the
// member index
514
// the opening of the struct holding these two function pointers (listing
// line 515) is not shown; the map below names the type GPUTaskTableEntry
516 Processor::TaskFuncPtr fnptr;
517 Hip::StreamAwareTaskFuncPtr stream_aware_fnptr;
519 };
520
521 // we're not using the parent's task table, but we can use the mutex
522 // RWLock task_table_mutex;
523 std::map<Processor::TaskFuncID, GPUTaskTableEntry> gpu_task_table;
524 };
525
526 // this can be attached to any MemoryImpl if the underlying memory is
527 // guaranteed to belong to a given device - this will allow that
528 // context's processor and dma channels to work with it
529 // the creator is expected to know what device they want but need
530 // not know which GPU object that corresponds to
532 public:
533 HipDeviceMemoryInfo(int _device_id);
534
537 };
538
540 public:
541 GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base,
542 size_t _size);
543
544 virtual ~GPUFBMemory(void);
545
546 // these work, but they are SLOW
547 virtual void get_bytes(off_t offset, void *dst, size_t size);
548 virtual void put_bytes(off_t offset, const void *src, size_t size);
549
550 virtual void *get_direct_ptr(off_t offset, size_t size);
551
552 // GPUFBMemory supports ExternalHipMemoryResource and
553 // ExternalHipArrayResource (not implemented)
555 size_t &inst_offset);
557
558 // for re-registration purposes, generate an ExternalInstanceResource *
559 // (if possible) for a given instance, or a subset of one
562 span<const FieldID> fields, bool read_only);
563
564 public:
566 char *base;
568 };
569
571 public:
572 GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu,
573 size_t _max_size);
574
575 virtual ~GPUDynamicFBMemory(void);
576 void cleanup(void);
577
578 // deferred allocation not supported
580 bool need_alloc_result,
581 bool poisoned,
582 TimeLimit work_until);
583
584 virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned,
585 TimeLimit work_until);
586
587 // these work, but they are SLOW
588 virtual void get_bytes(off_t offset, void *dst, size_t size);
589 virtual void put_bytes(off_t offset, const void *src, size_t size);
590
591 virtual void *get_direct_ptr(off_t offset, size_t size);
592
593 // GPUDynamicFBMemory supports ExternalHipMemoryResource and
594 // ExternalHipArrayResource (not implemented)
596 size_t &inst_offset);
598
599 // for re-registration purposes, generate an ExternalInstanceResource *
600 // (if possible) for a given instance, or a subset of one
603 span<const FieldID> fields, bool read_only);
604
605 public:
608 size_t cur_size;
609 std::map<RegionInstance, std::pair<void *, size_t>> alloc_bases;
610 };
611
613 public:
614 GPUZCMemory(RuntimeImpl *_runtime_impl, Memory _me, char *_gpu_base,
615 void *_cpu_base, size_t _size, MemoryKind _kind,
616 Memory::Kind _lowlevel_kind);
617
618 virtual ~GPUZCMemory(void);
619
620 virtual void get_bytes(off_t offset, void *dst, size_t size);
621
622 virtual void put_bytes(off_t offset, const void *src, size_t size);
623
624 virtual void *get_direct_ptr(off_t offset, size_t size);
625
626 // GPUZCMemory supports ExternalHipPinnedHostResource
628 size_t &inst_offset);
630
631 // for re-registration purposes, generate an ExternalInstanceResource *
632 // (if possible) for a given instance, or a subset of one
635 span<const FieldID> fields, bool read_only);
636
637 public:
638 char *gpu_base;
639 char *cpu_base;
641 };
642
// IBMemory subclass constructed from a runtime, a Memory handle, a GPU, a
// base pointer and a size.
// NOTE(review): listing lines 649 and 651 are absent from this rendering;
// the member index places `GPU *gpu` at 649 and
// `NetworkSegment local_segment` at 651.
643 class GPUFBIBMemory : public IBMemory {
644 public:
645 GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base,
646 size_t _size);
647
648 public:
650 char *base;
652 };
653
654 class GPURequest;
655
657 public:
659
661 };
662
// Request subclass carrying the source and destination base pointers.
// NOTE(review): listing lines 668-669 are absent from this rendering; the
// member index places `GPU *dst_gpu` at 668 and
// `GPUCompletionEvent event` at 669.
663 class GPURequest : public Request {
664 public:
665 const void *src_base;
666 void *dst_base;
667 // off_t src_gpu_off, dst_gpu_off;
670 };
671
673 public:
674 GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset,
675 size_t _read_size, int _write_port_idx, size_t _write_offset,
676 size_t _write_size);
677
678 virtual void request_completed(void);
679
680 protected:
686 };
687
688 class GPUChannel;
689
// XferDes subclass used with GPUChannel (see the SingleXDQChannel
// instantiation that pairs the two types).
// NOTE(review): listing line 698 is absent from this rendering; the member
// index lists `bool progress_xd(GPUChannel *channel, TimeLimit work_until)`
// for this class.
690 class GPUXferDes : public XferDes {
691 public:
692 GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
693 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
694 const std::vector<XferDesPortInfo> &outputs_info, int _priority);
695
696 long get_requests(Request **requests, long nr);
697
699
700 private:
// presumably one entry per input/output port - TODO confirm against the
// constructor in the implementation file
701 std::vector<GPU *> src_gpus, dst_gpus;
702 std::vector<bool> dst_is_ipc;
703 };
704
// Channel whose transfer descriptors are GPUXferDes objects.
// NOTE(review): listing lines 707-708 are absent from this rendering; the
// member index lists the constructor
// `GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)`.
705 class GPUChannel : public SingleXDQChannel<GPUChannel, GPUXferDes> {
706 public:
709
710 // multi-threading of cuda copies for a given device is disabled by
711 // default (can be re-enabled with -cuda:mtdma 1)
// NOTE(review): the comment above still uses the CUDA wording and flag name;
// confirm the option the HIP module actually parses
712 static const bool is_ordered = true;
713
// fill_data/fill_size/fill_total and redop_info are part of the shared
// create_xfer_des signature used by the fill and reduce channels as well
714 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
715 XferDesID guid,
716 const std::vector<XferDesPortInfo> &inputs_info,
717 const std::vector<XferDesPortInfo> &outputs_info,
718 int priority, XferDesRedopInfo redop_info,
719 const void *fill_data, size_t fill_size,
720 size_t fill_total);
721
722 long submit(Request **requests, long nr);
723
724 private:
725 GPU *src_gpu;
726 // std::deque<Request*> pending_copies;
727 };
728
729 class GPUfillChannel;
730
// XferDes subclass used with GPUfillChannel; the constructor additionally
// takes the fill pattern (_fill_data, _fill_size) and _fill_total.
// NOTE(review): listing lines 740 and 743 are absent from this rendering, so
// one public declaration and the protected member(s) are not visible here.
731 class GPUfillXferDes : public XferDes {
732 public:
733 GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
734 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
735 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
736 const void *_fill_data, size_t _fill_size, size_t _fill_total);
737
738 long get_requests(Request **requests, long nr);
739
741
742 protected:
744 };
745
// Channel whose transfer descriptors are GPUfillXferDes objects.
// NOTE(review): listing lines 748 and 766 are absent from this rendering, so
// the constructor and the protected member(s) are not visible here.
746 class GPUfillChannel : public SingleXDQChannel<GPUfillChannel, GPUfillXferDes> {
747 public:
749
750 // multiple concurrent cuda fills ok
751 static const bool is_ordered = false;
752
753 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
754 XferDesID guid,
755 const std::vector<XferDesPortInfo> &inputs_info,
756 const std::vector<XferDesPortInfo> &outputs_info,
757 int priority, XferDesRedopInfo redop_info,
758 const void *fill_data, size_t fill_size,
759 size_t fill_total);
760
761 long submit(Request **requests, long nr);
762
763 protected:
// the transfer descriptor class is granted access to the protected members
764 friend class GPUfillXferDes;
765
767 };
768
769 class GPUreduceChannel;
770
// XferDes subclass used with GPUreduceChannel; the constructor additionally
// takes the reduction description (_redop_info).
// NOTE(review): listing lines 780, 783-784 and 790 are absent from this
// rendering, so some declarations of this class are not visible here.
771 class GPUreduceXferDes : public XferDes {
772 public:
773 GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node,
774 XferDesID _guid, const std::vector<XferDesPortInfo> &inputs_info,
775 const std::vector<XferDesPortInfo> &outputs_info, int _priority,
776 XferDesRedopInfo _redop_info);
777
778 long get_requests(Request **requests, long nr);
779
781
782 protected:
// the kernel handle member differs by build: `void *kernel` with the hijack,
// `const void *kernel_host_proxy` without it
785#if defined(REALM_USE_HIP_HIJACK)
786 void *kernel;
787#else
788 const void *kernel_host_proxy;
789#endif
791 };
792
// Channel whose transfer descriptors are GPUreduceXferDes objects.
// NOTE(review): listing lines 795, 802 and 817 are absent from this
// rendering, so the constructor, one public declaration and the protected
// member(s) are not visible here.
793 class GPUreduceChannel : public SingleXDQChannel<GPUreduceChannel, GPUreduceXferDes> {
794 public:
796
797 // multiple concurrent cuda reduces ok
798 static const bool is_ordered = false;
799
// reports whether this channel can handle the given reduction op id
800 virtual bool supports_redop(ReductionOpID redop_id) const;
801
803
804 virtual XferDes *create_xfer_des(uintptr_t dma_op, NodeID launch_node,
805 XferDesID guid,
806 const std::vector<XferDesPortInfo> &inputs_info,
807 const std::vector<XferDesPortInfo> &outputs_info,
808 int priority, XferDesRedopInfo redop_info,
809 const void *fill_data, size_t fill_size,
810 size_t fill_total);
811
812 long submit(Request **requests, long nr);
813
814 protected:
// the transfer descriptor class is granted access to the protected members
815 friend class GPUreduceXferDes;
816
818 };
819
821 public:
822 GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr,
823 const std::vector<Channel::SupportedPath> &_paths);
824
826
827 template <typename S>
828 bool serialize(S &serializer) const;
829
830 template <typename S>
831 static RemoteChannelInfo *deserialize_new(S &deserializer);
832
833 protected:
837 };
838
841
842 GPUreduceRemoteChannel(uintptr_t _remote_ptr);
843 };
844
845 // active messages for establishing HIP IPC mappings
846
848#ifdef REALM_ON_LINUX
849 long hostid; // POSIX hostid
850#endif
851
852 static void handle_message(NodeID sender, const HipIpcRequest &args,
853 const void *data, size_t datalen);
854 };
855
857 unsigned count;
858
859 static void handle_message(NodeID sender, const HipIpcResponse &args,
860 const void *data, size_t datalen);
861 };
862
864
865 static void handle_message(NodeID sender, const HipIpcRelease &args,
866 const void *data, size_t datalen);
867 };
868
870 public:
872
873 virtual void chunk_created(void *base, size_t bytes);
874 virtual void chunk_destroyed(void *base, size_t bytes);
875
876 protected:
877 HipModule *module;
878 };
879
880 }; // namespace Hip
881
882}; // namespace Realm
883
884#endif
Definition bgwork.h:129
Definition bgwork.h:36
Definition bytearray.h:30
Definition bytearray.h:53
Definition channel.h:713
Definition circ_queue.h:35
Definition codedesc.h:249
Definition threads.h:382
Definition threads.h:342
Definition threads.h:428
Definition event.h:50
Definition instance.h:405
Definition hip_internal.h:416
GPU * gpu
Definition hip_internal.h:423
Definition hip_internal.h:289
Mutex::CondVar condvar
Definition hip_internal.h:307
bool shutdown_flag
Definition hip_internal.h:308
int max_threads
Definition hip_internal.h:305
GPU * gpu
Definition hip_internal.h:302
int total_threads
Definition hip_internal.h:310
ContextSynchronizer(GPU *_gpu, int _device_id, CoreReservationSet &crs, int _max_threads)
GPUWorkFence::FenceList fences
Definition hip_internal.h:309
int device_id
Definition hip_internal.h:304
CoreReservation * core_rsrv
Definition hip_internal.h:312
std::vector< Thread * > worker_threads
Definition hip_internal.h:311
int sleeping_threads
Definition hip_internal.h:310
int syncing_threads
Definition hip_internal.h:310
void add_fence(GPUWorkFence *fence)
Mutex mutex
Definition hip_internal.h:306
Definition hip_internal.h:705
long submit(Request **requests, long nr)
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition hip_internal.h:712
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
Definition hip_internal.h:656
GPURequest * req
Definition hip_internal.h:660
Definition hip_internal.h:106
virtual ~GPUCompletionNotification(void)
Definition hip_internal.h:108
virtual void request_completed(void)=0
Definition hip_internal.h:570
GPU * gpu
Definition hip_internal.h:606
virtual void unregister_external_resource(RegionInstanceImpl *inst)
std::map< RegionInstance, std::pair< void *, size_t > > alloc_bases
Definition hip_internal.h:609
size_t cur_size
Definition hip_internal.h:608
Mutex mutex
Definition hip_internal.h:607
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)
Definition hip_internal.h:266
int external_count
Definition hip_internal.h:281
void init_pool(int init_size=0)
int batch_size
Definition hip_internal.h:281
int current_size
Definition hip_internal.h:281
GPUEventPool(int _batch_size=256)
int total_size
Definition hip_internal.h:281
hipEvent_t get_event(bool external=false)
Mutex mutex
Definition hip_internal.h:280
std::vector< hipEvent_t > available_events
Definition hip_internal.h:282
void return_event(hipEvent_t e, bool external=false)
Definition hip_internal.h:643
char * base
Definition hip_internal.h:650
GPU * gpu
Definition hip_internal.h:649
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base, size_t _size)
NetworkSegment local_segment
Definition hip_internal.h:651
Definition hip_internal.h:539
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void get_bytes(off_t offset, void *dst, size_t size)
NetworkSegment local_segment
Definition hip_internal.h:567
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void * get_direct_ptr(off_t offset, size_t size)
char * base
Definition hip_internal.h:566
virtual ~GPUFBMemory(void)
GPU * gpu
Definition hip_internal.h:565
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, char *_base, size_t _size)
Definition hip_internal.h:113
virtual ~GPUPreemptionWaiter(void)
Definition hip_internal.h:116
virtual void request_completed(void)
Definition hip_internal.h:427
ContextSynchronizer ctxsync
Definition hip_internal.h:510
void gpu_memcpy_async(void *dst, const void *src, size_t size, hipMemcpyKind kind, hipStream_t stream)
void gpu_memset(void *dst, int value, size_t count)
std::vector< CallConfig > launch_configs
Definition hip_internal.h:506
virtual void shutdown(void)
void gpu_memset_async(void *dst, int value, size_t count, hipStream_t stream)
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)
void gpu_memcpy(void *dst, const void *src, size_t size, hipMemcpyKind kind)
std::vector< CallConfig > call_configs
Definition hip_internal.h:508
static GPUProcessor * get_current_gpu_proc(void)
std::vector< char > kernel_args
Definition hip_internal.h:507
virtual ~GPUProcessor(void)
void stream_synchronize(hipStream_t stream)
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)
bool block_on_synchronize
Definition hip_internal.h:509
void stream_wait_on_event(hipStream_t stream, hipEvent_t event)
GPU * gpu
Definition hip_internal.h:493
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition hip_internal.h:523
Realm::CoreReservation * core_rsrv
Definition hip_internal.h:513
Definition hip_internal.h:869
virtual void chunk_destroyed(void *base, size_t bytes)
virtual void chunk_created(void *base, size_t bytes)
GPUReplHeapListener(HipModule *_module)
Definition hip_internal.h:663
void * dst_base
Definition hip_internal.h:666
const void * src_base
Definition hip_internal.h:665
GPUCompletionEvent event
Definition hip_internal.h:669
GPU * dst_gpu
Definition hip_internal.h:668
Definition hip_internal.h:171
void add_fence(GPUWorkFence *fence)
void add_start_event(GPUWorkStart *start)
GPUWorker * worker
Definition hip_internal.h:205
std::deque< PendingEvent > pending_events
Definition hip_internal.h:220
void add_event(hipEvent_t event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)
bool has_work(void) const
GPU * gpu
Definition hip_internal.h:204
void add_notification(GPUCompletionNotification *notification)
bool reap_events(TimeLimit work_until)
hipStream_t stream
Definition hip_internal.h:207
Mutex mutex
Definition hip_internal.h:209
void wait_on_streams(const std::set< GPUStream * > &other_streams)
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)
REALM_INTERNAL_API_EXTERNAL_LINKAGE hipStream_t get_stream(void) const
bool ok_to_submit_copy(size_t bytes, XferDes *xd)
GPU * get_gpu(void) const
Definition hip_internal.h:672
int write_port_idx
Definition hip_internal.h:684
size_t read_offset
Definition hip_internal.h:683
size_t write_offset
Definition hip_internal.h:685
XferDes * xd
Definition hip_internal.h:681
virtual void request_completed(void)
int read_port_idx
Definition hip_internal.h:682
size_t write_size
Definition hip_internal.h:685
size_t read_size
Definition hip_internal.h:683
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)
Definition hip_internal.h:129
virtual void request_cancellation(void)
virtual void print(std::ostream &os) const
static void cuda_callback(hipStream_t stream, hipError_t res, void *data)
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition hip_internal.h:141
GPUWorkFence(Realm::Operation *op)
virtual void mark_finished(bool successful)
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition hip_internal.h:145
void enqueue_on_stream(GPUStream *stream)
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)
Definition hip_internal.h:151
virtual void request_cancellation(void)
Definition hip_internal.h:155
void enqueue_on_stream(GPUStream *stream)
static void cuda_start_callback(hipStream_t stream, hipError_t res, void *data)
GPUWorkStart(Realm::Operation *op)
virtual void print(std::ostream &os) const
Definition hip_internal.h:227
Realm::CoreReservation * core_rsrv
Definition hip_internal.h:258
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition hip_internal.h:254
atomic< bool > worker_shutdown_requested
Definition hip_internal.h:261
bool thread_sleeping
Definition hip_internal.h:260
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)
ActiveStreamQueue active_streams
Definition hip_internal.h:255
bool do_work(TimeLimit work_until)
virtual ~GPUWorker(void)
Mutex::CondVar condvar
Definition hip_internal.h:252
Mutex lock
Definition hip_internal.h:251
void shutdown_background_thread(void)
Realm::Thread * worker_thread
Definition hip_internal.h:259
void add_stream(GPUStream *s)
bool process_streams(bool sleep_on_empty)
Definition hip_internal.h:690
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)
bool progress_xd(GPUChannel *channel, TimeLimit work_until)
long get_requests(Request **requests, long nr)
Definition hip_internal.h:612
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
char * cpu_base
Definition hip_internal.h:639
virtual void put_bytes(off_t offset, const void *src, size_t size)
char * gpu_base
Definition hip_internal.h:638
GPUZCMemory(RuntimeImpl *_runtime_impl, Memory _me, char *_gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)
NetworkSegment local_segment
Definition hip_internal.h:640
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual ~GPUZCMemory(void)
virtual void get_bytes(off_t offset, void *dst, size_t size)
Definition hip_internal.h:322
int device_id
Definition hip_internal.h:366
GPUStream * find_stream(hipStream_t stream) const
std::vector< HipIpcMapping > hipipc_mappings
Definition hip_internal.h:403
GPUDynamicFBMemory * fb_dmem
Definition hip_internal.h:362
GPUProcessor * proc
Definition hip_internal.h:360
char * fb_ibmem_base
Definition hip_internal.h:370
std::set< Memory > pinned_sysmems
Definition hip_internal.h:373
hipModule_t load_hip_module(const void *data)
const HipIpcMapping * find_ipc_mapping(Memory mem) const
atomic< unsigned > next_d2d_stream
Definition hip_internal.h:389
GPUInfo * info
Definition hip_internal.h:358
GPUStream * device_to_device_stream
Definition hip_internal.h:384
int greatest_stream_priority
Definition hip_internal.h:395
atomic< unsigned > next_task_stream
Definition hip_internal.h:388
void create_dma_channels(Realm::RuntimeImpl *r)
char * fbmem_base
Definition hip_internal.h:368
std::vector< GPUStream * > peer_to_peer_streams
Definition hip_internal.h:386
void pop_context(void)
std::set< Memory > peer_fbs
Definition hip_internal.h:379
int least_stream_priority
Definition hip_internal.h:395
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)
GPUWorker * worker
Definition hip_internal.h:359
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const
void push_context(void)
std::vector< GPUStream * > device_to_device_streams
Definition hip_internal.h:385
GPUFBMemory * fbmem
Definition hip_internal.h:361
bool can_access_peer(GPU *peer)
std::vector< GPUStream * > task_streams
Definition hip_internal.h:387
GPUEventPool event_pool
Definition hip_internal.h:391
GPU(HipModule *_module, GPUInfo *_info, GPUWorker *worker, int _device_id)
GPUStream * device_to_host_stream
Definition hip_internal.h:383
GPUStream * host_to_device_stream
Definition hip_internal.h:382
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)
GPUStream * get_next_d2d_stream()
void create_processor(RuntimeImpl *runtime, size_t stack_size)
std::map< NodeID, GPUStream * > hipipc_streams
Definition hip_internal.h:404
GPUFBIBMemory * fb_ibmem
Definition hip_internal.h:363
GPUStream * get_next_task_stream(bool create=false)
std::set< Memory > managed_mems
Definition hip_internal.h:376
Definition hip_internal.h:746
GPU * gpu
Definition hip_internal.h:766
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
long submit(Request **requests, long nr)
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition hip_internal.h:751
Definition hip_internal.h:731
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)
size_t reduced_fill_size
Definition hip_internal.h:743
long get_requests(Request **requests, long nr)
Definition hip_internal.h:793
long submit(Request **requests, long nr)
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition hip_internal.h:798
virtual RemoteChannelInfo * construct_remote_info() const
GPU * gpu
Definition hip_internal.h:817
virtual bool supports_redop(ReductionOpID redop_id) const
Definition hip_internal.h:820
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition hip_internal.h:836
static RemoteChannelInfo * deserialize_new(S &deserializer)
virtual RemoteChannel * create_remote_channel()
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
bool serialize(S &serializer) const
Definition hip_internal.h:839
Definition hip_internal.h:771
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)
const void * kernel_host_proxy
Definition hip_internal.h:788
long get_requests(Request **requests, long nr)
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
XferDesRedopInfo redop_info
Definition hip_internal.h:783
GPUStream * stream
Definition hip_internal.h:790
const ReductionOpUntyped * redop
Definition hip_internal.h:784
Definition hip_internal.h:531
HipDeviceMemoryInfo(int _device_id)
GPU * gpu
Definition hip_internal.h:536
int device_id
Definition hip_internal.h:535
Definition hip_module.h:142
Definition ib_memory.h:30
Definition indexspace.h:1115
Definition lists.h:66
Definition mem_impl.h:344
Definition proc_impl.h:141
Definition mem_impl.h:50
MemoryKind
Definition mem_impl.h:53
size_t size
Definition mem_impl.h:195
AllocationResult
Definition mem_impl.h:89
Definition memory.h:33
Kind
Definition memory.h:59
Definition module.h:100
Definition network.h:262
Definition operation.h:75
Operation * op
Definition operation.h:87
Definition operation.h:32
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition inst_impl.h:54
Definition channel.h:891
Definition channel.h:934
Definition repl_heap.h:50
Definition channel.h:103
Definition runtime_impl.h:264
Definition channel.h:904
Definition channel.h:1014
Definition threads.h:89
Definition timers.h:129
Definition mutex.h:325
Definition mutex.h:223
Definition channel.h:286
Channel * channel
Definition channel.h:343
Definition atomics.h:31
Definition utils.h:84
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42
GPUMemcpyKind
Definition hip_internal.h:84
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition hip_internal.h:85
@ GPU_MEMCPY_PEER_TO_PEER
Definition hip_internal.h:88
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition hip_internal.h:86
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition hip_internal.h:87
HipModule * hip_module_singleton
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
XferDesKind
Definition channel.h:85
unsigned long long XferDesID
Definition channel.h:57
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
Definition hip_hijack.h:39
Definition hip_internal.h:69
size_t totalGlobalMem
Definition hip_internal.h:78
int major
Definition hip_internal.h:77
int index
Definition hip_internal.h:70
std::set< hipDevice_t > peers
Definition hip_internal.h:80
static const size_t MAX_NAME_LEN
Definition hip_internal.h:73
char name[MAX_NAME_LEN]
Definition hip_internal.h:75
int minor
Definition hip_internal.h:77
hipDevice_t device
Definition hip_internal.h:71
Definition hip_internal.h:502
CallConfig(dim3 _grid, dim3 _block, size_t _shared, hipStream_t _stream)
hipStream_t stream
Definition hip_internal.h:503
Definition hip_internal.h:515
Hip::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition hip_internal.h:517
ByteArray user_data
Definition hip_internal.h:518
Processor::TaskFuncPtr fnptr
Definition hip_internal.h:516
Definition hip_internal.h:496
size_t shared
Definition hip_internal.h:499
dim3 grid
Definition hip_internal.h:497
dim3 block
Definition hip_internal.h:498
LaunchConfig(dim3 _grid, dim3 _block, size_t _shared)
Definition hip_internal.h:211
GPUCompletionNotification * notification
Definition hip_internal.h:215
GPUWorkFence * fence
Definition hip_internal.h:213
GPUWorkStart * start
Definition hip_internal.h:214
hipEvent_t event
Definition hip_internal.h:212
Definition hip_internal.h:397
NodeID owner
Definition hip_internal.h:398
uintptr_t local_base
Definition hip_internal.h:400
Memory mem
Definition hip_internal.h:399
uintptr_t address_offset
Definition hip_internal.h:401
Definition hip_internal.h:863
static void handle_message(NodeID sender, const HipIpcRelease &args, const void *data, size_t datalen)
Definition hip_internal.h:847
static void handle_message(NodeID sender, const HipIpcRequest &args, const void *data, size_t datalen)
Definition hip_internal.h:856
unsigned count
Definition hip_internal.h:857
static void handle_message(NodeID sender, const HipIpcResponse &args, const void *data, size_t datalen)
Definition hip_hijack.h:46
Definition hip_hijack.h:55
Definition redop.h:56
Definition channel.h:210
NodeID src
Definition ucp_internal.h:1