18#ifndef REALM_CUDA_INTERNAL_H
19#define REALM_CUDA_INTERNAL_H
24#include <unordered_map>
25#if !defined(CUDA_ENABLE_DEPRECATED)
27#define CUDA_ENABLE_DEPRECATED 1
32#if defined(REALM_USE_CUDART_HIJACK)
33#include <cuda_runtime_api.h>
37#include <vector_types.h>
50#if CUDART_VERSION < 11000
51#define CHECK_CUDART(cmd) \
53 int ret = (int)(cmd); \
55 fprintf(stderr, "CUDART: %s = %d\n", #cmd, ret); \
62#define CHECK_CUDART(cmd) CHECK_CU((CUresult)(cmd))
66#if CUDA_VERSION >= 6050
67#define REPORT_CU_ERROR(level, cmd, ret) \
69 const char *name, *str; \
70 CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorName)(ret, &name); \
71 CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorString)(ret, &str); \
72 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret \
73 << '(' << name << "): " << str; \
76#define REPORT_CU_ERROR(level, cmd, ret) \
78 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret \
82#define CHECK_CU(cmd) \
84 CUresult ret = (cmd); \
85 if(ret != CUDA_SUCCESS) { \
86 REPORT_CU_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
91#define REPORT_NVML_ERROR(level, cmd, ret) \
93 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
96#define CHECK_NVML(cmd) \
98 nvmlReturn_t ret = (cmd); \
99 if(ret != NVML_SUCCESS) { \
100 REPORT_NVML_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
// Evaluates to true when `stream` compares equal to 0, CU_STREAM_LEGACY, or
// CU_STREAM_PER_THREAD, i.e. any of the three spellings this header treats as
// "the default stream".  `stream` is evaluated up to three times, so pass an
// expression without side effects.
105#define IS_DEFAULT_STREAM(stream) \
106 (((stream) == 0) || ((stream) == CU_STREAM_LEGACY) || \
107 ((stream) == CU_STREAM_PER_THREAD))
109#define REPORT_CUPTI_ERROR(level, cmd, ret) \
111 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
114#define CHECK_CUPTI(cmd) \
116 CUptiResult ret = (cmd); \
117 if(ret != CUPTI_SUCCESS) { \
118 REPORT_CUPTI_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
156#ifdef REALM_USE_CUDART_HIJACK
174 class GPUDynamicFBMemory;
214 virtual void print(std::ostream &os)
const;
235 virtual void print(std::ostream &os)
const;
424 size_t field_size,
size_t volume,
430 CUfunction apply_nonexcl, CUfunction fold_excl,
431 CUfunction fold_nonexcl);
601 size_t &inst_offset);
626 bool need_alloc_result,
642 size_t &inst_offset);
655 std::map<RegionInstance, std::pair<CUdeviceptr, size_t>>
alloc_bases;
662 void *_cpu_base,
size_t _size,
MemoryKind _kind,
675 size_t &inst_offset);
722 XferDes *_xd,
int _read_port_idx,
size_t _read_offset,
size_t _read_size,
723 int _write_port_idx,
size_t _write_offset,
size_t _write_size,
724 int _read_ind_port_idx = -1,
size_t _read_ind_offset = 0,
725 size_t _read_ind_size = 0,
int _write_ind_port_idx = -1,
726 size_t _write_ind_offset = 0,
size_t _write_ind_size = 0);
745 size_t _read_size,
int _write_port_idx,
size_t _write_offset,
770 size_t field_offset,
int ndims,
const int64_t lo[],
771 const int64_t hi[],
const int order[]);
784 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
785 const std::vector<XferDesPortInfo> &outputs_info,
int _priority);
792 std::vector<GPU *> src_gpus, dst_gpus;
793 std::vector<bool> dst_is_ipc;
801 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
802 const std::vector<XferDesPortInfo> &outputs_info,
int _priority,
831 size_t total_bytes,
const std::vector<size_t> *src_frags,
832 const std::vector<size_t> *dst_frags,
XferDesKind *kind_ret = 0,
833 unsigned *bw_ret = 0,
unsigned *lat_ret = 0);
839 const std::vector<XferDesPortInfo> &inputs_info,
840 const std::vector<XferDesPortInfo> &outputs_info,
842 const void *fill_data,
size_t fill_size,
856 uintptr_t _remote_ptr,
857 const std::vector<Channel::SupportedPath> &_paths,
858 const std::vector<Memory> &_indirect_memories);
862 template <
typename S>
865 template <
typename S>
879 const std::vector<Memory> &_indirect_memories);
885 size_t total_bytes,
const std::vector<size_t> *src_frags,
886 const std::vector<size_t> *dst_frags,
XferDesKind *kind_ret ,
887 unsigned *bw_ret ,
unsigned *lat_ret );
901 const std::vector<XferDesPortInfo> &inputs_info,
902 const std::vector<XferDesPortInfo> &outputs_info,
904 const void *fill_data,
size_t fill_size,
915 class GPUfillChannel;
920 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
921 const std::vector<XferDesPortInfo> &outputs_info,
int _priority,
922 const void *_fill_data,
size_t _fill_size,
size_t _fill_total);
941 const std::vector<XferDesPortInfo> &inputs_info,
942 const std::vector<XferDesPortInfo> &outputs_info,
944 const void *fill_data,
size_t fill_size,
960 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
961 const std::vector<XferDesPortInfo> &outputs_info,
int _priority,
991 const std::vector<XferDesPortInfo> &inputs_info,
992 const std::vector<XferDesPortInfo> &outputs_info,
994 const void *fill_data,
size_t fill_size,
995 size_t fill_total)
override;
1008 const std::vector<Channel::SupportedPath> &_paths);
1012 template <
typename S>
1015 template <
typename S>
1033#if !defined(REALM_IS_WINDOWS)
1037 const void *data,
size_t datalen);
1065 inline operator
bool(
void)
const {
return dev_ptr != 0; }
1079 if(has_ipc_handle) {
1082 return has_ipc_handle;
1084#if CUDA_VERSION >= 12030
1089 bool get_fabric_handle(CUmemFabricHandle &
handle)
const;
1099 inline CUdeviceptr
get_dptr(
void)
const {
return dev_ptr; }
1112 template <
typename T =
void>
1115 return static_cast<T *
>(host_ptr);
1137 bool shareable =
true);
1138#if CUDA_VERSION >= 11000
1152 static GPUAllocation *allocate_mmap(
GPU *gpu,
const CUmemAllocationProp &prop,
1153 size_t size, CUdeviceptr vaddr = 0,
1154 bool peer_enabled =
true);
1168 bool shareable =
true,
bool same_va =
true);
1188 bool peer_enabled =
true);
1204 bool peer_enabled =
true);
1205#if CUDA_VERSION >= 12030
1217 size_t size,
bool peer_enabled =
true,
1218 bool is_local =
false);
1222 CUresult map_allocation(
GPU *gpu, CUmemGenericAllocationHandle
handle,
size_t size,
1223 CUdeviceptr va = 0,
size_t offset = 0,
1224 bool peer_enabled =
false,
bool map_host =
false);
1226#if CUDA_VERSION >= 11000
1232 static size_t align_size(
const CUmemAllocationProp &prop,
size_t size);
1243#if CUDA_VERSION >= 11000
1251 CUdeviceptr dev_ptr = 0;
1253 void *host_ptr =
nullptr;
1257 DeleterCallback deleter =
nullptr;
1258#if CUDA_VERSION >= 11000
1260 CUmemGenericAllocationHandle mmap_handle = 0;
1263 bool owns_va =
true;
1266 bool has_ipc_handle =
false;
1268 CUipcMemHandle ipc_handle;
1275#if CUDA_VERSION < 11030
1276#define CU_GET_PROC_ADDRESS_DEFAULT 0
1280#if CUDA_VERSION < 12050
1284#if CUDA_VERSION >= 13000
1287#if !defined(cuCtxGetDevice)
1288#define cuCtxGetDevice cuCtxGetDevice_v2
1290#if !defined(cuCtxSynchronize)
1291#define cuCtxSynchronize cuCtxSynchronize_v2
1293#if !defined(cuStreamGetCtx)
1294#define cuStreamGetCtx cuStreamGetCtx_v2
// CUDA_DRIVER_HAS_FNPTR(name): true when the `<name>_fnptr` pointer is non-null.
1299#define CUDA_DRIVER_HAS_FNPTR(name) ((name##_fnptr) != nullptr)
// CUDA_DRIVER_FNPTR(name): comma expression that first asserts `<name>_fnptr`
// is non-null and then yields the pointer, so it can be called directly:
//   CUDA_DRIVER_FNPTR(cuInit)(0)
// When assert() is compiled out (NDEBUG) no check is performed.
1300#define CUDA_DRIVER_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
// Version number passed as the second argument for most entries of the
// CUDA_DRIVER_APIS table.
// NOTE(review): presumably the minimum driver API version requested when the
// entry points are resolved -- confirm against the loader code.
1310#define CUDA_VERSION_MIN 11080
// CUDA_VERSION rounded down to a multiple of 1000 (integer division followed
// by multiplication), e.g. 12030 -> 12000.
1313#define CUDA_VERSION_COMPAT ((CUDA_VERSION / 1000) * 1000)
// X-macro table of CUDA driver API entry points.
// Expands to `__op__(name, version);` once per entry, where `__op__` is a
// caller-supplied two-argument macro.  Every entry passes CUDA_VERSION_MIN as
// the version except cuCtxRecordEvent, which passes 12050.  The expansion
// already ends with a ';' after the last entry.
1315#define CUDA_DRIVER_APIS(__op__) \
1316 __op__(cuModuleGetFunction, CUDA_VERSION_MIN); \
1317 __op__(cuCtxGetDevice, CUDA_VERSION_MIN); \
1318 __op__(cuCtxEnablePeerAccess, CUDA_VERSION_MIN); \
1319 __op__(cuCtxGetFlags, CUDA_VERSION_MIN); \
1320 __op__(cuCtxGetStreamPriorityRange, CUDA_VERSION_MIN); \
1321 __op__(cuCtxPopCurrent, CUDA_VERSION_MIN); \
1322 __op__(cuCtxPushCurrent, CUDA_VERSION_MIN); \
1323 __op__(cuCtxSynchronize, CUDA_VERSION_MIN); \
1324 __op__(cuDeviceCanAccessPeer, CUDA_VERSION_MIN); \
1325 __op__(cuDeviceGet, CUDA_VERSION_MIN); \
1326 __op__(cuDeviceGetUuid, CUDA_VERSION_MIN); \
1327 __op__(cuDeviceGetAttribute, CUDA_VERSION_MIN); \
1328 __op__(cuDeviceGetCount, CUDA_VERSION_MIN); \
1329 __op__(cuDeviceGetName, CUDA_VERSION_MIN); \
1330 __op__(cuDevicePrimaryCtxRelease, CUDA_VERSION_MIN); \
1331 __op__(cuDevicePrimaryCtxRetain, CUDA_VERSION_MIN); \
1332 __op__(cuDevicePrimaryCtxSetFlags, CUDA_VERSION_MIN); \
1333 __op__(cuDeviceTotalMem, CUDA_VERSION_MIN); \
1334 __op__(cuEventCreate, CUDA_VERSION_MIN); \
1335 __op__(cuEventDestroy, CUDA_VERSION_MIN); \
1336 __op__(cuEventQuery, CUDA_VERSION_MIN); \
1337 __op__(cuEventRecord, CUDA_VERSION_MIN); \
1338 __op__(cuGetErrorName, CUDA_VERSION_MIN); \
1339 __op__(cuGetErrorString, CUDA_VERSION_MIN); \
1340 __op__(cuInit, CUDA_VERSION_MIN); \
1341 __op__(cuIpcCloseMemHandle, CUDA_VERSION_MIN); \
1342 __op__(cuIpcGetMemHandle, CUDA_VERSION_MIN); \
1343 __op__(cuIpcOpenMemHandle, CUDA_VERSION_MIN); \
1344 __op__(cuLaunchKernel, CUDA_VERSION_MIN); \
1345 __op__(cuMemAllocManaged, CUDA_VERSION_MIN); \
1346 __op__(cuMemAlloc, CUDA_VERSION_MIN); \
1347 __op__(cuMemcpy2DAsync, CUDA_VERSION_MIN); \
1348 __op__(cuMemcpy3DAsync, CUDA_VERSION_MIN); \
1349 __op__(cuMemcpyAsync, CUDA_VERSION_MIN); \
1350 __op__(cuMemcpyDtoDAsync, CUDA_VERSION_MIN); \
1351 __op__(cuMemcpyDtoH, CUDA_VERSION_MIN); \
1352 __op__(cuMemcpyDtoHAsync, CUDA_VERSION_MIN); \
1353 __op__(cuMemcpyHtoD, CUDA_VERSION_MIN); \
1354 __op__(cuMemcpyHtoDAsync, CUDA_VERSION_MIN); \
1355 __op__(cuMemFreeHost, CUDA_VERSION_MIN); \
1356 __op__(cuMemFree, CUDA_VERSION_MIN); \
1357 __op__(cuMemGetInfo, CUDA_VERSION_MIN); \
1358 __op__(cuMemHostAlloc, CUDA_VERSION_MIN); \
1359 __op__(cuMemHostGetDevicePointer, CUDA_VERSION_MIN); \
1360 __op__(cuMemHostRegister, CUDA_VERSION_MIN); \
1361 __op__(cuMemHostUnregister, CUDA_VERSION_MIN); \
1362 __op__(cuMemsetD16Async, CUDA_VERSION_MIN); \
1363 __op__(cuMemsetD2D16Async, CUDA_VERSION_MIN); \
1364 __op__(cuMemsetD2D32Async, CUDA_VERSION_MIN); \
1365 __op__(cuMemsetD2D8Async, CUDA_VERSION_MIN); \
1366 __op__(cuMemsetD32Async, CUDA_VERSION_MIN); \
1367 __op__(cuMemsetD8Async, CUDA_VERSION_MIN); \
1368 __op__(cuModuleLoadDataEx, CUDA_VERSION_MIN); \
1369 __op__(cuStreamAddCallback, CUDA_VERSION_MIN); \
1370 __op__(cuStreamCreate, CUDA_VERSION_MIN); \
1371 __op__(cuStreamCreateWithPriority, CUDA_VERSION_MIN); \
1372 __op__(cuStreamDestroy, CUDA_VERSION_MIN); \
1373 __op__(cuStreamSynchronize, CUDA_VERSION_MIN); \
1374 __op__(cuOccupancyMaxPotentialBlockSize, CUDA_VERSION_MIN); \
1375 __op__(cuOccupancyMaxPotentialBlockSizeWithFlags, CUDA_VERSION_MIN); \
1376 __op__(cuEventSynchronize, CUDA_VERSION_MIN); \
1377 __op__(cuEventElapsedTime, CUDA_VERSION_MIN); \
1378 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessor, CUDA_VERSION_MIN); \
1379 __op__(cuMemAddressReserve, CUDA_VERSION_MIN); \
1380 __op__(cuMemAddressFree, CUDA_VERSION_MIN); \
1381 __op__(cuMemCreate, CUDA_VERSION_MIN); \
1382 __op__(cuMemRelease, CUDA_VERSION_MIN); \
1383 __op__(cuMemMap, CUDA_VERSION_MIN); \
1384 __op__(cuMemUnmap, CUDA_VERSION_MIN); \
1385 __op__(cuMemSetAccess, CUDA_VERSION_MIN); \
1386 __op__(cuMemGetAllocationGranularity, CUDA_VERSION_MIN); \
1387 __op__(cuMemGetAllocationPropertiesFromHandle, CUDA_VERSION_MIN); \
1388 __op__(cuMemExportToShareableHandle, CUDA_VERSION_MIN); \
1389 __op__(cuMemImportFromShareableHandle, CUDA_VERSION_MIN); \
1390 __op__(cuStreamWaitEvent, CUDA_VERSION_MIN); \
1391 __op__(cuStreamQuery, CUDA_VERSION_MIN); \
1392 __op__(cuMemGetAddressRange, CUDA_VERSION_MIN); \
1393 __op__(cuPointerGetAttributes, CUDA_VERSION_MIN); \
1394 __op__(cuDriverGetVersion, CUDA_VERSION_MIN); \
1395 __op__(cuMemAdvise, CUDA_VERSION_MIN); \
1396 __op__(cuMemPrefetchAsync, CUDA_VERSION_MIN); \
1397 __op__(cuCtxSetSharedMemConfig, CUDA_VERSION_MIN); \
1398 __op__(cuCtxSetCacheConfig, CUDA_VERSION_MIN); \
1399 __op__(cuCtxSetLimit, CUDA_VERSION_MIN); \
1400 __op__(cuCtxGetLimit, CUDA_VERSION_MIN); \
1401 __op__(cuFuncSetAttribute, CUDA_VERSION_MIN); \
1402 __op__(cuFuncSetCacheConfig, CUDA_VERSION_MIN); \
1403 __op__(cuFuncSetSharedMemConfig, CUDA_VERSION_MIN); \
1404 __op__(cuFuncGetAttribute, CUDA_VERSION_MIN); \
1405 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, CUDA_VERSION_MIN); \
1406 __op__(cuArray3DCreate, CUDA_VERSION_MIN); \
1407 __op__(cuArrayDestroy, CUDA_VERSION_MIN); \
1408 __op__(cuSurfObjectCreate, CUDA_VERSION_MIN); \
1409 __op__(cuSurfObjectDestroy, CUDA_VERSION_MIN); \
1410 __op__(cuLaunchCooperativeKernel, CUDA_VERSION_MIN); \
1411 __op__(cuModuleGetGlobal, CUDA_VERSION_MIN); \
1412 __op__(cuLaunchHostFunc, CUDA_VERSION_MIN); \
1413 __op__(cuCtxRecordEvent, 12050); \
1414 __op__(cuArrayGetMemoryRequirements, CUDA_VERSION_MIN);
// Temporary helper: declares `extern decltype(&name) name##_fnptr;`, i.e. an
// extern pointer whose type is taken from the declaration of `name`.  The
// `ver` argument is accepted but not used by this expansion.
// NOTE(review): the statement that applies this helper between the #define and
// the #undef is not visible in this listing -- presumably
// CUDA_DRIVER_APIS(DECL_FNPTR_EXTERN); confirm.
1417#define DECL_FNPTR_EXTERN(name, ver) extern decltype(&name) name##_fnptr;
1419#undef DECL_FNPTR_EXTERN
// Yields the `<name>_fnptr` pointer for an NVML entry point.  No null check
// is performed by this macro.
1421#define NVML_FNPTR(name) (name##_fnptr)
1423#if NVML_API_VERSION >= 11
1424#define NVML_11_APIS(__op__) __op__(nvmlDeviceGetMemoryAffinity);
1426#define NVML_11_APIS(__op__)
1429#if NVML_API_VERSION >= 12
1430#define NVML_12_APIS(__op__) __op__(nvmlDeviceGetGpuFabricInfo)
1432#define NVML_12_APIS(__op__)
1435#if CUDA_VERSION < 11040
1456#define NVML_APIS(__op__) \
1458 __op__(nvmlDeviceGetHandleByUUID); \
1459 __op__(nvmlDeviceGetMaxPcieLinkWidth); \
1460 __op__(nvmlDeviceGetMaxPcieLinkGeneration); \
1461 __op__(nvmlDeviceGetNvLinkState); \
1462 __op__(nvmlDeviceGetNvLinkVersion); \
1463 __op__(nvmlDeviceGetNvLinkRemotePciInfo); \
1464 __op__(nvmlDeviceGetNvLinkRemoteDeviceType); \
1465 __op__(nvmlDeviceGetDeviceHandleFromMigDeviceHandle); \
1466 __op__(nvmlDeviceGetFieldValues); \
1467 NVML_11_APIS(__op__); \
1468 NVML_12_APIS(__op__);
// Temporary one-argument helper: declares `extern decltype(&name) name##_fnptr;`.
// NOTE(review): the statement that applies this helper between the #define and
// the #undef is not visible in this listing -- presumably
// NVML_APIS(DECL_FNPTR_EXTERN); confirm.
1470#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1472#undef DECL_FNPTR_EXTERN
// X-macro table of CUPTI activity API entry points.
// Expands to `__op__(name);` once per entry, where `__op__` is a
// caller-supplied one-argument macro.  The expansion already ends with a ';'
// after the last entry.
1474#define CUPTI_APIS(__op__) \
1475 __op__(cuptiActivityRegisterCallbacks); \
1476 __op__(cuptiActivityEnable); \
1477 __op__(cuptiActivityDisable); \
1478 __op__(cuptiActivityEnableContext); \
1479 __op__(cuptiActivityDisableContext); \
1480 __op__(cuptiActivityFlushAll); \
1481 __op__(cuptiActivityGetNextRecord); \
1482 __op__(cuptiActivityRegisterTimestampCallback); \
1483 __op__(cuptiActivityPushExternalCorrelationId); \
1484 __op__(cuptiActivityPopExternalCorrelationId);
// Temporary one-argument helper: declares `extern decltype(&name) name##_fnptr;`.
// NOTE(review): the statement that applies this helper between the #define and
// the #undef is not visible in this listing -- presumably
// CUPTI_APIS(DECL_FNPTR_EXTERN); confirm.
1486#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1488#undef DECL_FNPTR_EXTERN
// CUPTI_HAS_FNPTR(name): true when the `<name>_fnptr` pointer is non-null.
1490#define CUPTI_HAS_FNPTR(name) (name##_fnptr != nullptr)
// CUPTI_FNPTR(name): comma expression that first asserts `<name>_fnptr` is
// non-null and then yields the pointer.  When assert() is compiled out
// (NDEBUG) no check is performed.
1491#define CUPTI_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
bootstrap_handle_t * handle
Definition bootstrap.h:61
Definition bytearray.h:30
Definition bytearray.h:53
Definition circ_queue.h:35
Definition codedesc.h:249
Definition cuda_internal.h:766
int dim
Definition cuda_internal.h:774
CUarray array
Definition cuda_internal.h:773
size_t width_in_bytes
Definition cuda_internal.h:776
size_t height
Definition cuda_internal.h:776
size_t pos[3]
Definition cuda_internal.h:775
size_t depth
Definition cuda_internal.h:776
virtual int set_rect(const RegionInstanceImpl *inst, const InstanceLayoutPieceBase *piece, size_t field_size, size_t field_offset, int ndims, const int64_t lo[], const int64_t hi[], const int order[])
Definition cuda_internal.h:528
AutoGPUContext(GPU *_gpu)
AutoGPUContext(GPU &_gpu)
GPU * gpu
Definition cuda_internal.h:535
Definition cuda_internal.h:360
Mutex mutex
Definition cuda_internal.h:376
std::vector< Thread * > worker_threads
Definition cuda_internal.h:381
int total_threads
Definition cuda_internal.h:380
CoreReservation * core_rsrv
Definition cuda_internal.h:382
int max_threads
Definition cuda_internal.h:375
void add_fence(GPUWorkFence *fence)
ContextSynchronizer(GPU *_gpu, CUcontext _context, CoreReservationSet &crs, int _max_threads)
GPU * gpu
Definition cuda_internal.h:373
int syncing_threads
Definition cuda_internal.h:380
Mutex::CondVar condvar
Definition cuda_internal.h:377
CUcontext context
Definition cuda_internal.h:374
int sleeping_threads
Definition cuda_internal.h:380
bool shutdown_flag
Definition cuda_internal.h:378
GPUWorkFence::FenceList fences
Definition cuda_internal.h:379
Definition cuda_internal.h:577
CudaDeviceMemoryInfo(CUcontext _context)
GPU * gpu
Definition cuda_internal.h:582
CUcontext context
Definition cuda_internal.h:581
Definition cuda_module.h:165
Class for managing the lifetime of a given gpu allocation. As instances of this class own an underlyi...
Definition cuda_internal.h:1054
static GPUAllocation * open_handle(GPU *gpu, OsHandle hdl, size_t size, bool peer_enabled=true)
Retrieves the GPUAllocation given the OsHandle.
static GPUAllocation * register_allocation(GPU *gpu, void *ptr, size_t size, bool peer_enabled=true)
Create an allocation that registers the given CPU address range with CUDA, making it accessible from ...
static GPUAllocation * allocate_host(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true, bool same_va=true)
Allocate CPU-located memory for the given gpu with the given size and features.
T * get_hptr(void) const
Retrieves the CPU accessible base address for the allocation, or nullptr if there is no way to access...
Definition cuda_internal.h:1113
static GPUAllocation * allocate_managed(GPU *gpu, size_t size)
Allocate migratable memory that can be used with CUDA's managed memory APIs (cuMemPrefetchAsync,...
GPUAllocation(const GPUAllocation &)=delete
size_t get_size(void) const
Retrieves the given size of the allocation.
Definition cuda_internal.h:1105
OsHandle get_os_handle(void) const
Accessor for the file descriptor or win32 HANDLE associated with the allocation. This handle can be s...
bool get_ipc_handle(CUipcMemHandle &handle) const
Retrieves the CUipcMemHandle for this allocation that can be used with GPUAllocation::open_ipc.
Definition cuda_internal.h:1077
static GPUAllocation * allocate_dev(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true)
Allocates device-located memory for the given gpu with the given size and features.
GPUAllocation & operator=(GPUAllocation &&) noexcept
GPUAllocation(void)=default
static void * get_win32_shared_attributes(void)
Retrieves the default win32 shared attributes for creating a shared object that can be set in CUmemAl...
static GPUAllocation * open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl)
Retrieves the GPUAllocation given the CUipcMemHandle.
GPUAllocation(GPUAllocation &&other) noexcept
CUdeviceptr get_dptr(void) const
Retrieves the base CUdeviceptr for the associated allocation that can be used to access the underlyin...
Definition cuda_internal.h:1099
GPU * get_gpu(void) const
Retrieves the owning GPU.
Definition cuda_internal.h:1102
Definition cuda_internal.h:890
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition cuda_internal.h:897
long submit(Request **requests, long nr)
GPU * get_gpu() const
Definition cuda_internal.h:908
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
Definition cuda_internal.h:703
GPURequest * req
Definition cuda_internal.h:707
void request_completed(void)
Definition cuda_internal.h:196
virtual ~GPUCompletionNotification(void)
Definition cuda_internal.h:198
virtual void request_completed(void)=0
Definition cuda_internal.h:183
void destroy_context(InternalTask *task, void *context) const override
void destroy_context(Task *task, void *context) const override
GPU * gpu
Definition cuda_internal.h:190
void * create_context(Task *task) const override
GPUContextManager(GPU *_gpu, GPUProcessor *proc)
void * create_context(InternalTask *task) const override
GPUProcessor * proc
Definition cuda_internal.h:191
Definition cuda_internal.h:616
GPU * gpu
Definition cuda_internal.h:652
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)
size_t cur_size
Definition cuda_internal.h:654
NetworkSegment local_segment
Definition cuda_internal.h:656
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)
Mutex mutex
Definition cuda_internal.h:653
std::map< RegionInstance, std::pair< CUdeviceptr, size_t > > alloc_bases
Definition cuda_internal.h:655
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual ~GPUDynamicFBMemory(void)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
Definition cuda_internal.h:337
CUevent get_event(bool external=false)
Mutex mutex
Definition cuda_internal.h:351
void init_pool(int init_size=0)
int batch_size
Definition cuda_internal.h:352
void return_event(CUevent e, bool external=false)
std::vector< CUevent > available_events
Definition cuda_internal.h:353
GPUEventPool(int _batch_size=256)
int total_size
Definition cuda_internal.h:352
int current_size
Definition cuda_internal.h:352
int external_count
Definition cuda_internal.h:352
Definition cuda_internal.h:690
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
NetworkSegment local_segment
Definition cuda_internal.h:698
CUdeviceptr base
Definition cuda_internal.h:697
GPU * gpu
Definition cuda_internal.h:696
Definition cuda_internal.h:585
NetworkSegment local_segment
Definition cuda_internal.h:613
GPU * gpu
Definition cuda_internal.h:611
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void * get_direct_ptr(off_t offset, size_t size)
CUdeviceptr base
Definition cuda_internal.h:612
virtual void put_bytes(off_t offset, const void *src, size_t size)
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual ~GPUFBMemory(void)
Definition cuda_internal.h:814
GPU * get_gpu() const
Definition cuda_internal.h:846
static const bool is_ordered
Definition cuda_internal.h:821
GPU * src_gpu
Definition cuda_internal.h:850
virtual bool supports_indirection_memory(Memory mem) const
Queries if a given mem can be used as an indirection buffer.
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
virtual Memory suggest_ib_memories() const
GPUIndirectChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
virtual bool needs_wrapping_iterator() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret=0, unsigned *bw_ret=0, unsigned *lat_ret=0)
Definition cuda_internal.h:853
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUIndirectRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:871
bool serialize(S &serializer) const
GPUIndirectRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths, const std::vector< Memory > &_indirect_memories)
virtual RemoteChannel * create_remote_channel()
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:874
virtual bool needs_wrapping_iterator() const
virtual Memory suggest_ib_memories() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret, unsigned *bw_ret, unsigned *lat_ret)
GPUIndirectRemoteChannel(uintptr_t _remote_ptr, const std::vector< Memory > &_indirect_memories)
Definition cuda_internal.h:719
size_t read_ind_offset
Definition cuda_internal.h:735
size_t write_size
Definition cuda_internal.h:737
size_t write_offset
Definition cuda_internal.h:737
int write_ind_port_idx
Definition cuda_internal.h:738
size_t read_offset
Definition cuda_internal.h:733
size_t write_ind_offset
Definition cuda_internal.h:739
size_t read_ind_size
Definition cuda_internal.h:735
GPUIndirectTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size, int _read_ind_port_idx=-1, size_t _read_ind_offset=0, size_t _read_ind_size=0, int _write_ind_port_idx=-1, size_t _write_ind_offset=0, size_t _write_ind_size=0)
int write_port_idx
Definition cuda_internal.h:736
virtual void request_completed(void)
size_t write_ind_size
Definition cuda_internal.h:739
int read_ind_port_idx
Definition cuda_internal.h:734
XferDes * xd
Definition cuda_internal.h:731
size_t read_size
Definition cuda_internal.h:733
int read_port_idx
Definition cuda_internal.h:732
Definition cuda_internal.h:798
bool progress_xd(GPUIndirectChannel *channel, TimeLimit work_until)
std::vector< bool > dst_is_ipc
Definition cuda_internal.h:810
std::vector< GPU * > dst_gpus
Definition cuda_internal.h:809
long get_requests(Request **requests, long nr)
std::vector< GPU * > src_gpus
Definition cuda_internal.h:809
GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
Definition cuda_internal.h:539
virtual void shutdown(void)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:559
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)
virtual ~GPUProcessor(void)
GPU * gpu
Definition cuda_internal.h:556
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition cuda_internal.h:569
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)
Definition cuda_internal.h:1040
GPUReplHeapListener(CudaModule *_module)
virtual void chunk_created(void *base, size_t bytes)
virtual void chunk_destroyed(void *base, size_t bytes)
Definition cuda_internal.h:710
GPUCompletionEvent event
Definition cuda_internal.h:716
void * dst_base
Definition cuda_internal.h:713
const void * src_base
Definition cuda_internal.h:712
GPU * dst_gpu
Definition cuda_internal.h:715
Definition cuda_internal.h:247
bool ok_to_submit_copy(size_t bytes, XferDes *xd)
REALM_INTERNAL_API_EXTERNAL_LINKAGE CUstream get_stream(void) const
void add_notification(GPUCompletionNotification *notification)
void add_event(CUevent event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)
Mutex mutex
Definition cuda_internal.h:281
void add_start_event(GPUWorkStart *start)
bool has_work(void) const
GPU * gpu
Definition cuda_internal.h:276
GPU * get_gpu(void) const
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)
GPUWorker * worker
Definition cuda_internal.h:277
bool reap_events(TimeLimit work_until)
void add_fence(GPUWorkFence *fence)
std::deque< PendingEvent > pending_events
Definition cuda_internal.h:291
void wait_on_streams(const std::set< GPUStream * > &other_streams)
CUstream stream
Definition cuda_internal.h:279
Definition cuda_internal.h:742
size_t read_offset
Definition cuda_internal.h:753
int write_port_idx
Definition cuda_internal.h:754
XferDes * xd
Definition cuda_internal.h:751
size_t write_size
Definition cuda_internal.h:755
size_t write_offset
Definition cuda_internal.h:755
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)
virtual void request_completed(void)
size_t read_size
Definition cuda_internal.h:753
int read_port_idx
Definition cuda_internal.h:752
Definition cuda_internal.h:203
virtual void request_cancellation(void)
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition cuda_internal.h:216
GPU * gpu
Definition cuda_internal.h:224
void enqueue_on_stream(GPUStream *stream)
virtual void print(std::ostream &os) const
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition cuda_internal.h:220
GPUWorkFence(GPU *gpu, Realm::Operation *op)
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)
static void cuda_callback(CUstream stream, CUresult res, void *data)
virtual void mark_finished(bool successful)
Definition cuda_internal.h:227
void mark_gpu_work_start()
void enqueue_on_stream(GPUStream *stream)
GPUWorkStart(Realm::Operation *op)
virtual void request_cancellation(void)
Definition cuda_internal.h:231
virtual void print(std::ostream &os) const
static void cuda_start_callback(CUstream stream, CUresult res, void *data)
Definition cuda_internal.h:298
bool process_streams(bool sleep_on_empty)
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition cuda_internal.h:325
void shutdown_background_thread(void)
ActiveStreamQueue active_streams
Definition cuda_internal.h:326
void add_stream(GPUStream *s)
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:329
Mutex::CondVar condvar
Definition cuda_internal.h:323
Mutex lock
Definition cuda_internal.h:322
atomic< bool > worker_shutdown_requested
Definition cuda_internal.h:332
bool do_work(TimeLimit work_until)
bool thread_sleeping
Definition cuda_internal.h:331
Realm::Thread * worker_thread
Definition cuda_internal.h:330
Definition cuda_internal.h:781
bool progress_xd(GPUChannel *channel, TimeLimit work_until)
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)
long get_requests(Request **requests, long nr)
Definition cuda_internal.h:659
NetworkSegment local_segment
Definition cuda_internal.h:687
virtual ~GPUZCMemory(void)
char * cpu_base
Definition cuda_internal.h:686
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void put_bytes(off_t offset, const void *src, size_t size)
CUdeviceptr gpu_base
Definition cuda_internal.h:685
virtual void get_bytes(off_t offset, void *dst, size_t size)
Definition cuda_internal.h:392
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)
void launch_transpose_kernel(MemcpyTransposeInfo< size_t > &copy_info, size_t elemSize, GPUStream *stream)
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)
void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
bool register_reduction(ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl, CUfunction fold_excl, CUfunction fold_nonexcl)
CUdeviceptr fbmem_base
Definition cuda_internal.h:475
GPUFuncInfo fill_affine_large_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:472
std::vector< CudaIpcMapping > cudaipc_mappings
Definition cuda_internal.h:512
bool can_access_peer(const GPU *peer) const
GPUFBMemory * fbmem
Definition cuda_internal.h:444
GPUFuncInfo transpose_kernels[CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:473
std::unordered_map< ReductionOpID, GPUReductionOpEntry > gpu_reduction_table
Definition cuda_internal.h:524
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const
GPUStream * device_to_host_stream
Definition cuda_internal.h:490
GPUProcessor * proc
Definition cuda_internal.h:441
GPUFuncInfo batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:468
std::vector< GPUStream * > task_streams
Definition cuda_internal.h:494
ContextSynchronizer ctxsync
Definition cuda_internal.h:437
CUmodule device_module
Definition cuda_internal.h:450
void create_processor(RuntimeImpl *runtime, size_t stack_size)
Mutex alloc_mutex
Definition cuda_internal.h:514
std::set< Memory > managed_mems
Definition cuda_internal.h:483
GPUStream * host_to_device_stream
Definition cuda_internal.h:489
GPUStream * get_next_d2d_stream()
GPUFuncInfo indirect_copy_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:467
GPUStream * device_to_device_stream
Definition cuda_internal.h:491
size_t cupti_activity_refcount
Definition cuda_internal.h:497
std::map< NodeID, GPUStream * > cudaipc_streams
Definition cuda_internal.h:513
std::map< CUdeviceptr, GPUAllocation > allocations
Definition cuda_internal.h:443
std::set< Memory > pinned_sysmems
Definition cuda_internal.h:480
GPUFuncInfo batch_fill_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:470
static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES
Definition cuda_internal.h:464
GPUStream * get_next_task_stream(bool create=false)
void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size, size_t field_size, size_t volume, GPUStream *stream)
CUmodule load_cuda_module(const void *data)
int least_stream_priority
Definition cuda_internal.h:503
CUdeviceptr fb_ibmem_base
Definition cuda_internal.h:477
GPUAllocation & add_allocation(GPUAllocation &&alloc)
GPUDynamicFBMemory * fb_dmem
Definition cuda_internal.h:445
void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
int greatest_stream_priority
Definition cuda_internal.h:503
const CudaIpcMapping * find_ipc_mapping(Memory mem) const
GPUEventPool event_pool
Definition cuda_internal.h:499
CUcontext context
Definition cuda_internal.h:448
GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context)
std::vector< GPUStream * > device_to_device_streams
Definition cuda_internal.h:492
GPUFBIBMemory * fb_ibmem
Definition cuda_internal.h:446
atomic< unsigned > next_task_stream
Definition cuda_internal.h:495
GPUStream * find_stream(CUstream stream) const
bool is_accessible_host_mem(const MemoryImpl *mem) const
GPUInfo * info
Definition cuda_internal.h:439
std::vector< GPUStream * > peer_to_peer_streams
Definition cuda_internal.h:493
GPUWorker * worker
Definition cuda_internal.h:440
void create_dma_channels(Realm::RuntimeImpl *r)
std::set< Memory > peer_fbs
Definition cuda_internal.h:486
atomic< unsigned > next_d2d_stream
Definition cuda_internal.h:496
bool is_accessible_gpu_mem(const MemoryImpl *mem) const
Definition cuda_internal.h:932
static const bool is_ordered
Definition cuda_internal.h:937
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
long submit(Request **requests, long nr)
GPU * gpu
Definition cuda_internal.h:952
Definition cuda_internal.h:917
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)
size_t reduced_fill_size
Definition cuda_internal.h:929
long get_requests(Request **requests, long nr)
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)
Definition cuda_internal.h:978
GPU * gpu
Definition cuda_internal.h:1002
RemoteChannelInfo * construct_remote_info() const override
static const bool is_ordered
Definition cuda_internal.h:983
XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total) override
long submit(Request **requests, long nr) override
bool supports_redop(ReductionOpID redop_id) const override
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
Definition cuda_internal.h:1005
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
bool serialize(S &serializer) const
virtual RemoteChannel * create_remote_channel()
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:1021
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:1024
Definition cuda_internal.h:957
long get_requests(Request **requests, long nr)
std::vector< bool > src_is_ipc
Definition cuda_internal.h:975
GPUStream * stream
Definition cuda_internal.h:973
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
const void * kernel_host_proxy
Definition cuda_internal.h:972
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)
CUfunction kernel
Definition cuda_internal.h:971
XferDesRedopInfo redop_info
Definition cuda_internal.h:969
std::vector< GPU * > src_gpus
Definition cuda_internal.h:974
const ReductionOpUntyped * redop
Definition cuda_internal.h:970
Definition cuda_internal.h:758
MemSpecificCudaArray(CUarray _array)
CUarray array
Definition cuda_internal.h:763
virtual ~MemSpecificCudaArray()
Definition instance.h:405
Definition ib_memory.h:30
Definition indexspace.h:1115
Definition inst_layout.h:266
Definition mem_impl.h:344
Definition proc_impl.h:141
Definition mem_impl.h:212
MemoryKind
Definition mem_impl.h:53
size_t size
Definition mem_impl.h:195
AllocationResult
Definition mem_impl.h:89
Kind
Definition memory.h:59
Definition operation.h:75
Operation * op
Definition operation.h:87
Definition operation.h:32
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition inst_impl.h:54
Definition repl_heap.h:50
Definition runtime_impl.h:264
Definition serialize.h:363
Definition channel.h:1014
Channel * channel
Definition channel.h:343
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define CUDA_DRIVER_APIS(__op__)
Definition cuda_internal.h:1315
#define NVML_APIS(__op__)
Definition cuda_internal.h:1456
#define DECL_FNPTR_EXTERN(name, ver)
Definition cuda_internal.h:1417
#define CUPTI_APIS(__op__)
Definition cuda_internal.h:1474
#define cudaDeviceProp
Definition hip_cuda.h:24
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42
CudaModule * cuda_module_singleton
CUresult cuGetProcAddress(const char *, void **, int, int)
CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event)
GPUMemcpyKind
Definition cuda_internal.h:162
@ GPU_MEMCPY_PEER_TO_PEER
Definition cuda_internal.h:166
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition cuda_internal.h:163
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition cuda_internal.h:164
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition cuda_internal.h:165
nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType)
nvmlIntNvLinkDeviceType_enum
Definition cuda_internal.h:1444
@ NVML_NVLINK_DEVICE_TYPE_IBMNPU
Definition cuda_internal.h:1446
@ NVML_NVLINK_DEVICE_TYPE_SWITCH
Definition cuda_internal.h:1447
@ NVML_NVLINK_DEVICE_TYPE_UNKNOWN
Definition cuda_internal.h:1448
@ NVML_NVLINK_DEVICE_TYPE_GPU
Definition cuda_internal.h:1445
enum Realm::Cuda::nvmlIntNvLinkDeviceType_enum nvmlIntNvLinkDeviceType_t
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
XferDesKind
Definition channel.h:85
int CustomSerdezID
Definition custom_serdez.h:148
int OsHandle
Definition utils.h:399
unsigned long long XferDesID
Definition channel.h:57
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
#define REALM_MAX_DIM
Definition realm_config.h:34
Definition cuda_internal.h:1031
static void handle_message(NodeID sender, const CudaIpcImportRequest &args, const void *data, size_t datalen)
unsigned count
Definition cuda_internal.h:1032
long hostid
Definition cuda_internal.h:1034
Definition cudart_hijack.h:53
Definition cuda_internal.h:127
int pci_busid
Definition cuda_internal.h:141
CUdevice device
Definition cuda_internal.h:129
size_t pci_bandwidth
Definition cuda_internal.h:145
std::vector< size_t > logical_peer_bandwidth
Definition cuda_internal.h:148
int pci_domainid
Definition cuda_internal.h:142
CUuuid uuid
Definition cuda_internal.h:131
std::set< CUdevice > peers
Definition cuda_internal.h:140
bool has_numa_preference
Definition cuda_internal.h:138
bool pageable_access_supported
Definition cuda_internal.h:154
std::vector< size_t > logical_peer_latency
Definition cuda_internal.h:149
bool host_gpu_same_va
Definition cuda_internal.h:147
unsigned fabric_clique
Definition cuda_internal.h:152
bool fabric_supported
Definition cuda_internal.h:151
char name[MAX_NAME_LEN]
Definition cuda_internal.h:135
int major
Definition cuda_internal.h:132
size_t totalGlobalMem
Definition cuda_internal.h:136
int pci_deviceid
Definition cuda_internal.h:143
nvmlDevice_t nvml_dev
Definition cuda_internal.h:130
unsigned long numa_node_affinity[MAX_NUMA_NODE_LEN]
Definition cuda_internal.h:139
size_t c2c_bandwidth
Definition cuda_internal.h:144
int index
Definition cuda_internal.h:128
int minor
Definition cuda_internal.h:133
size_t nvswitch_bandwidth
Definition cuda_internal.h:146
CUuuid fabric_uuid
Definition cuda_internal.h:153
static const size_t MAX_NAME_LEN
Definition cuda_internal.h:134
static const size_t MAX_NUMA_NODE_LEN
Definition cuda_internal.h:137
Definition cuda_internal.h:561
Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition cuda_internal.h:563
Processor::TaskFuncPtr fnptr
Definition cuda_internal.h:562
ByteArray user_data
Definition cuda_internal.h:564
Definition cuda_internal.h:282
GPUWorkStart * start
Definition cuda_internal.h:285
CUevent event
Definition cuda_internal.h:283
GPUWorkFence * fence
Definition cuda_internal.h:284
GPUCompletionNotification * notification
Definition cuda_internal.h:286
Definition cuda_internal.h:505
uintptr_t address_offset
Definition cuda_internal.h:510
NodeID owner
Definition cuda_internal.h:506
GPU * src_gpu
Definition cuda_internal.h:507
Memory mem
Definition cuda_internal.h:508
uintptr_t local_base
Definition cuda_internal.h:509
Definition cuda_internal.h:452
CUfunction func
Definition cuda_internal.h:453
int occ_num_threads
Definition cuda_internal.h:454
int occ_num_blocks
Definition cuda_internal.h:455
Definition cuda_internal.h:517
CUfunction fold_excl
Definition cuda_internal.h:521
CUfunction apply_nonexcl
Definition cuda_internal.h:518
CUfunction fold_nonexcl
Definition cuda_internal.h:520
CUfunction apply_excl
Definition cuda_internal.h:519
Definition cuda_memcpy.h:109
Definition cudart_hijack.h:65
Definition cudart_hijack.h:76
NodeID src
Definition ucp_internal.h:1