18#ifndef REALM_CUDA_INTERNAL_H
19#define REALM_CUDA_INTERNAL_H
24#include <unordered_map>
25#if !defined(CUDA_ENABLE_DEPRECATED)
27#define CUDA_ENABLE_DEPRECATED 1
32#if defined(REALM_USE_CUDART_HIJACK)
33#include <cuda_runtime_api.h>
37#include <vector_types.h>
50#if CUDART_VERSION < 11000
51#define CHECK_CUDART(cmd) \
53 int ret = (int)(cmd); \
55 fprintf(stderr, "CUDART: %s = %d\n", #cmd, ret); \
62#define CHECK_CUDART(cmd) CHECK_CU((CUresult)(cmd))
66#if CUDA_VERSION >= 6050
67#define REPORT_CU_ERROR(level, cmd, ret) \
69 const char *name, *str; \
70 CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorName)(ret, &name); \
71 CUDA_DRIVER_FNPTR(Realm::Cuda::cuGetErrorString)(ret, &str); \
72 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret \
73 << '(' << name << "): " << str; \
76#define REPORT_CU_ERROR(level, cmd, ret) \
78 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret \
82#define CHECK_CU(cmd) \
84 CUresult ret = (cmd); \
85 if(ret != CUDA_SUCCESS) { \
86 REPORT_CU_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
91#define REPORT_NVML_ERROR(level, cmd, ret) \
93 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
96#define CHECK_NVML(cmd) \
98 nvmlReturn_t ret = (cmd); \
99 if(ret != NVML_SUCCESS) { \
100 REPORT_NVML_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
// Evaluates to true when `stream` compares equal to any of the three values
// tested below: 0, CU_STREAM_LEGACY, or CU_STREAM_PER_THREAD.
// NOTE: `stream` appears three times in the expansion, so an argument with
// side effects may be evaluated more than once.
105#define IS_DEFAULT_STREAM(stream) \
106 (((stream) == 0) || ((stream) == CU_STREAM_LEGACY) || \
107 ((stream) == CU_STREAM_PER_THREAD))
109#define REPORT_CUPTI_ERROR(level, cmd, ret) \
111 log_gpu.newmsg(level) << __FILE__ << '(' << __LINE__ << "):" << cmd << " = " << ret; \
114#define CHECK_CUPTI(cmd) \
116 CUptiResult ret = (cmd); \
117 if(ret != CUPTI_SUCCESS) { \
118 REPORT_CUPTI_ERROR(Logger::LEVEL_ERROR, #cmd, ret); \
156#ifdef REALM_USE_CUDART_HIJACK
174 class GPUDynamicFBMemory;
214 virtual void print(std::ostream &os)
const;
235 virtual void print(std::ostream &os)
const;
419 size_t volume,
bool multified_optimized,
425 size_t field_size,
size_t volume,
431 ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl,
432 CUfunction fold_excl, CUfunction fold_nonexcl, CUfunction apply_excl_advanced,
433 CUfunction apply_nonexcl_advanced, CUfunction fold_excl_advanced,
434 CUfunction fold_nonexcl_advanced, CUfunction apply_excl_transpose,
435 CUfunction apply_nonexcl_transpose, CUfunction fold_excl_transpose,
436 CUfunction fold_nonexcl_transpose);
616 size_t &inst_offset);
641 bool need_alloc_result,
657 size_t &inst_offset);
670 std::map<RegionInstance, std::pair<CUdeviceptr, size_t>>
alloc_bases;
677 void *_cpu_base,
size_t _size,
MemoryKind _kind,
690 size_t &inst_offset);
737 XferDes *_xd,
int _read_port_idx,
size_t _read_offset,
size_t _read_size,
738 int _write_port_idx,
size_t _write_offset,
size_t _write_size,
739 int _read_ind_port_idx = -1,
size_t _read_ind_offset = 0,
740 size_t _read_ind_size = 0,
int _write_ind_port_idx = -1,
741 size_t _write_ind_offset = 0,
size_t _write_ind_size = 0);
760 size_t _read_size,
int _write_port_idx,
size_t _write_offset,
785 size_t field_offset,
int ndims,
const int64_t lo[],
786 const int64_t hi[],
const int order[]);
799 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
800 const std::vector<XferDesPortInfo> &outputs_info,
int _priority);
810 size_t bytes_left,
size_t max_xfer_fields,
811 size_t &fields_total);
814 std::vector<GPU *> src_gpus, dst_gpus;
815 std::vector<bool> dst_is_ipc;
820 static constexpr size_t min_xfer_size = 4 << 20;
826 static constexpr size_t max_xfer_size = 4ULL * 1024ULL * 1024ULL * 1024ULL;
827 static constexpr size_t max_xfer_fields = 2000;
835 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
836 const std::vector<XferDesPortInfo> &outputs_info,
int _priority,
865 size_t total_bytes,
const std::vector<size_t> *src_frags,
866 const std::vector<size_t> *dst_frags,
XferDesKind *kind_ret = 0,
867 unsigned *bw_ret = 0,
unsigned *lat_ret = 0);
873 const std::vector<XferDesPortInfo> &inputs_info,
874 const std::vector<XferDesPortInfo> &outputs_info,
876 const void *fill_data,
size_t fill_size,
890 uintptr_t _remote_ptr,
891 const std::vector<Channel::SupportedPath> &_paths,
892 const std::vector<Memory> &_indirect_memories);
896 template <
typename S>
899 template <
typename S>
913 const std::vector<Memory> &_indirect_memories);
919 size_t total_bytes,
const std::vector<size_t> *src_frags,
920 const std::vector<size_t> *dst_frags,
XferDesKind *kind_ret ,
921 unsigned *bw_ret ,
unsigned *lat_ret );
935 const std::vector<XferDesPortInfo> &inputs_info,
936 const std::vector<XferDesPortInfo> &outputs_info,
938 const void *fill_data,
size_t fill_size,
958 const std::vector<Channel::SupportedPath> &_paths);
962 template <
typename S>
965 template <
typename S>
986 class GPUfillChannel;
991 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
992 const std::vector<XferDesPortInfo> &outputs_info,
int _priority,
993 const void *_fill_data,
size_t _fill_size,
size_t _fill_total);
1012 const std::vector<XferDesPortInfo> &inputs_info,
1013 const std::vector<XferDesPortInfo> &outputs_info,
1015 const void *fill_data,
size_t fill_size,
1036 XferDesID _guid,
const std::vector<XferDesPortInfo> &inputs_info,
1037 const std::vector<XferDesPortInfo> &outputs_info,
int _priority,
1045 const size_t in_span_start,
1046 const size_t out_span_start);
1049 const size_t in_span_start,
const size_t out_span_start,
1050 const size_t in_elem_size,
const size_t out_elem_size,
1051 const size_t elems,
const bool has_transpose);
1085 const std::vector<XferDesPortInfo> &inputs_info,
1086 const std::vector<XferDesPortInfo> &outputs_info,
1088 const void *fill_data,
size_t fill_size,
1089 size_t fill_total)
override;
1102 const std::vector<Channel::SupportedPath> &_paths);
1106 template <
typename S>
1109 template <
typename S>
1127#if !defined(REALM_IS_WINDOWS)
1131 const void *data,
size_t datalen);
1159 inline operator
bool(
void)
const {
return dev_ptr != 0; }
1173 if(has_ipc_handle) {
1176 return has_ipc_handle;
1178#if CUDA_VERSION >= 12030
1183 bool get_fabric_handle(CUmemFabricHandle &
handle)
const;
1193 inline CUdeviceptr
get_dptr(
void)
const {
return dev_ptr; }
1206 template <
typename T =
void>
1209 return static_cast<T *
>(host_ptr);
1231 bool shareable =
true);
1232#if CUDA_VERSION >= 11000
1246 static GPUAllocation *allocate_mmap(
GPU *gpu,
const CUmemAllocationProp &prop,
1247 size_t size, CUdeviceptr vaddr = 0,
1248 bool peer_enabled =
true);
1262 bool shareable =
true,
bool same_va =
true);
1282 bool peer_enabled =
true);
1298 bool peer_enabled =
true);
1299#if CUDA_VERSION >= 12030
1311 size_t size,
bool peer_enabled =
true,
1312 bool is_local =
false);
1316 CUresult map_allocation(
GPU *gpu, CUmemGenericAllocationHandle
handle,
size_t size,
1317 CUdeviceptr va = 0,
size_t offset = 0,
1318 bool peer_enabled =
false,
bool map_host =
false);
1320#if CUDA_VERSION >= 11000
1326 static size_t align_size(
const CUmemAllocationProp &prop,
size_t size);
1337#if CUDA_VERSION >= 11000
1345 CUdeviceptr dev_ptr = 0;
1347 void *host_ptr =
nullptr;
1351 DeleterCallback deleter =
nullptr;
1352#if CUDA_VERSION >= 11000
1354 CUmemGenericAllocationHandle mmap_handle = 0;
1357 bool owns_va =
true;
1360 bool has_ipc_handle =
false;
1362 CUipcMemHandle ipc_handle;
1369#if CUDA_VERSION < 11030
1370#define CU_GET_PROC_ADDRESS_DEFAULT 0
1374#if CUDA_VERSION < 12050
1378#if CUDA_VERSION >= 13000
1381#if !defined(cuCtxGetDevice)
1382#define cuCtxGetDevice cuCtxGetDevice_v2
1384#if !defined(cuCtxSynchronize)
1385#define cuCtxSynchronize cuCtxSynchronize_v2
1387#if !defined(cuStreamGetCtx)
1388#define cuStreamGetCtx cuStreamGetCtx_v2
// CUDA_DRIVER_HAS_FNPTR(name): true when the variable `name##_fnptr` is
// non-null.
// CUDA_DRIVER_FNPTR(name): yields `name##_fnptr`, first asserting that it is
// non-null. The check is a plain assert() joined with the comma operator, so
// it is only active in builds where assert() is enabled (NDEBUG not defined).
1393#define CUDA_DRIVER_HAS_FNPTR(name) ((name##_fnptr) != nullptr)
1394#define CUDA_DRIVER_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
// CUDA_VERSION_MIN: the version number passed for almost every entry in the
// driver API list that follows.
// CUDA_VERSION_COMPAT: CUDA_VERSION rounded down to a multiple of 1000 via
// integer division (presumably "major version only" given CUDA's
// major*1000 + minor*10 encoding -- TODO confirm intended use).
1404#define CUDA_VERSION_MIN 11080
1407#define CUDA_VERSION_COMPAT ((CUDA_VERSION / 1000) * 1000)
// List macro: expands to one `__op__(api_name, version);` invocation per CUDA
// driver entry point named below. The caller supplies `__op__` as a
// two-argument macro. Every entry passes CUDA_VERSION_MIN as the version
// except cuCtxRecordEvent, which passes 12050.
// To add an entry point, append another `__op__(...)` line; every line but
// the last must end in a continuation backslash.
1409#define CUDA_DRIVER_APIS(__op__) \
1410 __op__(cuModuleGetFunction, CUDA_VERSION_MIN); \
1411 __op__(cuCtxGetDevice, CUDA_VERSION_MIN); \
1412 __op__(cuCtxEnablePeerAccess, CUDA_VERSION_MIN); \
1413 __op__(cuCtxGetFlags, CUDA_VERSION_MIN); \
1414 __op__(cuCtxGetStreamPriorityRange, CUDA_VERSION_MIN); \
1415 __op__(cuCtxPopCurrent, CUDA_VERSION_MIN); \
1416 __op__(cuCtxPushCurrent, CUDA_VERSION_MIN); \
1417 __op__(cuCtxSynchronize, CUDA_VERSION_MIN); \
1418 __op__(cuDeviceCanAccessPeer, CUDA_VERSION_MIN); \
1419 __op__(cuDeviceGet, CUDA_VERSION_MIN); \
1420 __op__(cuDeviceGetUuid, CUDA_VERSION_MIN); \
1421 __op__(cuDeviceGetAttribute, CUDA_VERSION_MIN); \
1422 __op__(cuDeviceGetCount, CUDA_VERSION_MIN); \
1423 __op__(cuDeviceGetName, CUDA_VERSION_MIN); \
1424 __op__(cuDevicePrimaryCtxRelease, CUDA_VERSION_MIN); \
1425 __op__(cuDevicePrimaryCtxRetain, CUDA_VERSION_MIN); \
1426 __op__(cuDevicePrimaryCtxSetFlags, CUDA_VERSION_MIN); \
1427 __op__(cuDeviceTotalMem, CUDA_VERSION_MIN); \
1428 __op__(cuEventCreate, CUDA_VERSION_MIN); \
1429 __op__(cuEventDestroy, CUDA_VERSION_MIN); \
1430 __op__(cuEventQuery, CUDA_VERSION_MIN); \
1431 __op__(cuEventRecord, CUDA_VERSION_MIN); \
1432 __op__(cuGetErrorName, CUDA_VERSION_MIN); \
1433 __op__(cuGetErrorString, CUDA_VERSION_MIN); \
1434 __op__(cuInit, CUDA_VERSION_MIN); \
1435 __op__(cuIpcCloseMemHandle, CUDA_VERSION_MIN); \
1436 __op__(cuIpcGetMemHandle, CUDA_VERSION_MIN); \
1437 __op__(cuIpcOpenMemHandle, CUDA_VERSION_MIN); \
1438 __op__(cuLaunchKernel, CUDA_VERSION_MIN); \
1439 __op__(cuMemAllocManaged, CUDA_VERSION_MIN); \
1440 __op__(cuMemAlloc, CUDA_VERSION_MIN); \
1441 __op__(cuMemcpy2DAsync, CUDA_VERSION_MIN); \
1442 __op__(cuMemcpy3DAsync, CUDA_VERSION_MIN); \
1443 __op__(cuMemcpyAsync, CUDA_VERSION_MIN); \
1444 __op__(cuMemcpyDtoDAsync, CUDA_VERSION_MIN); \
1445 __op__(cuMemcpyDtoH, CUDA_VERSION_MIN); \
1446 __op__(cuMemcpyDtoHAsync, CUDA_VERSION_MIN); \
1447 __op__(cuMemcpyHtoD, CUDA_VERSION_MIN); \
1448 __op__(cuMemcpyHtoDAsync, CUDA_VERSION_MIN); \
1449 __op__(cuMemFreeHost, CUDA_VERSION_MIN); \
1450 __op__(cuMemFree, CUDA_VERSION_MIN); \
1451 __op__(cuMemGetInfo, CUDA_VERSION_MIN); \
1452 __op__(cuMemHostAlloc, CUDA_VERSION_MIN); \
1453 __op__(cuMemHostGetDevicePointer, CUDA_VERSION_MIN); \
1454 __op__(cuMemHostRegister, CUDA_VERSION_MIN); \
1455 __op__(cuMemHostUnregister, CUDA_VERSION_MIN); \
1456 __op__(cuMemsetD16Async, CUDA_VERSION_MIN); \
1457 __op__(cuMemsetD2D16Async, CUDA_VERSION_MIN); \
1458 __op__(cuMemsetD2D32Async, CUDA_VERSION_MIN); \
1459 __op__(cuMemsetD2D8Async, CUDA_VERSION_MIN); \
1460 __op__(cuMemsetD32Async, CUDA_VERSION_MIN); \
1461 __op__(cuMemsetD8Async, CUDA_VERSION_MIN); \
1462 __op__(cuModuleLoadDataEx, CUDA_VERSION_MIN); \
1463 __op__(cuStreamAddCallback, CUDA_VERSION_MIN); \
1464 __op__(cuStreamCreate, CUDA_VERSION_MIN); \
1465 __op__(cuStreamCreateWithPriority, CUDA_VERSION_MIN); \
1466 __op__(cuStreamDestroy, CUDA_VERSION_MIN); \
1467 __op__(cuStreamSynchronize, CUDA_VERSION_MIN); \
1468 __op__(cuOccupancyMaxPotentialBlockSize, CUDA_VERSION_MIN); \
1469 __op__(cuOccupancyMaxPotentialBlockSizeWithFlags, CUDA_VERSION_MIN); \
1470 __op__(cuEventSynchronize, CUDA_VERSION_MIN); \
1471 __op__(cuEventElapsedTime, CUDA_VERSION_MIN); \
1472 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessor, CUDA_VERSION_MIN); \
1473 __op__(cuMemAddressReserve, CUDA_VERSION_MIN); \
1474 __op__(cuMemAddressFree, CUDA_VERSION_MIN); \
1475 __op__(cuMemCreate, CUDA_VERSION_MIN); \
1476 __op__(cuMemRelease, CUDA_VERSION_MIN); \
1477 __op__(cuMemMap, CUDA_VERSION_MIN); \
1478 __op__(cuMemUnmap, CUDA_VERSION_MIN); \
1479 __op__(cuMemSetAccess, CUDA_VERSION_MIN); \
1480 __op__(cuMemGetAllocationGranularity, CUDA_VERSION_MIN); \
1481 __op__(cuMemGetAllocationPropertiesFromHandle, CUDA_VERSION_MIN); \
1482 __op__(cuMemExportToShareableHandle, CUDA_VERSION_MIN); \
1483 __op__(cuMemImportFromShareableHandle, CUDA_VERSION_MIN); \
1484 __op__(cuStreamWaitEvent, CUDA_VERSION_MIN); \
1485 __op__(cuStreamQuery, CUDA_VERSION_MIN); \
1486 __op__(cuMemGetAddressRange, CUDA_VERSION_MIN); \
1487 __op__(cuPointerGetAttributes, CUDA_VERSION_MIN); \
1488 __op__(cuDriverGetVersion, CUDA_VERSION_MIN); \
1489 __op__(cuMemAdvise, CUDA_VERSION_MIN); \
1490 __op__(cuMemPrefetchAsync, CUDA_VERSION_MIN); \
1491 __op__(cuCtxSetSharedMemConfig, CUDA_VERSION_MIN); \
1492 __op__(cuCtxSetCacheConfig, CUDA_VERSION_MIN); \
1493 __op__(cuCtxSetLimit, CUDA_VERSION_MIN); \
1494 __op__(cuCtxGetLimit, CUDA_VERSION_MIN); \
1495 __op__(cuFuncSetAttribute, CUDA_VERSION_MIN); \
1496 __op__(cuFuncSetCacheConfig, CUDA_VERSION_MIN); \
1497 __op__(cuFuncSetSharedMemConfig, CUDA_VERSION_MIN); \
1498 __op__(cuFuncGetAttribute, CUDA_VERSION_MIN); \
1499 __op__(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, CUDA_VERSION_MIN); \
1500 __op__(cuArray3DCreate, CUDA_VERSION_MIN); \
1501 __op__(cuArrayDestroy, CUDA_VERSION_MIN); \
1502 __op__(cuSurfObjectCreate, CUDA_VERSION_MIN); \
1503 __op__(cuSurfObjectDestroy, CUDA_VERSION_MIN); \
1504 __op__(cuLaunchCooperativeKernel, CUDA_VERSION_MIN); \
1505 __op__(cuModuleGetGlobal, CUDA_VERSION_MIN); \
1506 __op__(cuLaunchHostFunc, CUDA_VERSION_MIN); \
1507 __op__(cuCtxRecordEvent, 12050); \
1508 __op__(cuArrayGetMemoryRequirements, CUDA_VERSION_MIN);
1511#define DECL_FNPTR_EXTERN(name, ver) extern decltype(&name) name##_fnptr;
1513#undef DECL_FNPTR_EXTERN
1515#define NVML_FNPTR(name) (name##_fnptr)
1517#if NVML_API_VERSION >= 11
1518#define NVML_11_APIS(__op__) __op__(nvmlDeviceGetMemoryAffinity);
1520#define NVML_11_APIS(__op__)
1523#if NVML_API_VERSION >= 12
1524#define NVML_12_APIS(__op__) __op__(nvmlDeviceGetGpuFabricInfo)
1526#define NVML_12_APIS(__op__)
1529#if CUDA_VERSION < 11040
1550#define NVML_APIS(__op__) \
1552 __op__(nvmlDeviceGetHandleByUUID); \
1553 __op__(nvmlDeviceGetMaxPcieLinkWidth); \
1554 __op__(nvmlDeviceGetMaxPcieLinkGeneration); \
1555 __op__(nvmlDeviceGetNvLinkState); \
1556 __op__(nvmlDeviceGetNvLinkVersion); \
1557 __op__(nvmlDeviceGetNvLinkRemotePciInfo); \
1558 __op__(nvmlDeviceGetNvLinkRemoteDeviceType); \
1559 __op__(nvmlDeviceGetDeviceHandleFromMigDeviceHandle); \
1560 __op__(nvmlDeviceGetFieldValues); \
1561 NVML_11_APIS(__op__); \
1562 NVML_12_APIS(__op__);
1564#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1566#undef DECL_FNPTR_EXTERN
// List macro: expands to one `__op__(api_name);` invocation per CUPTI entry
// point named below. The caller supplies `__op__` as a one-argument macro.
// All entries listed here are cuptiActivity* functions.
1568#define CUPTI_APIS(__op__) \
1569 __op__(cuptiActivityRegisterCallbacks); \
1570 __op__(cuptiActivityEnable); \
1571 __op__(cuptiActivityDisable); \
1572 __op__(cuptiActivityEnableContext); \
1573 __op__(cuptiActivityDisableContext); \
1574 __op__(cuptiActivityFlushAll); \
1575 __op__(cuptiActivityGetNextRecord); \
1576 __op__(cuptiActivityRegisterTimestampCallback); \
1577 __op__(cuptiActivityPushExternalCorrelationId); \
1578 __op__(cuptiActivityPopExternalCorrelationId);
1580#define DECL_FNPTR_EXTERN(name) extern decltype(&name) name##_fnptr;
1582#undef DECL_FNPTR_EXTERN
// CUPTI_HAS_FNPTR(name): true when the variable `name##_fnptr` is non-null.
// CUPTI_FNPTR(name): yields `name##_fnptr`, first asserting that it is
// non-null; the assert() only fires in builds where NDEBUG is not defined.
1584#define CUPTI_HAS_FNPTR(name) (name##_fnptr != nullptr)
1585#define CUPTI_FNPTR(name) (assert(name##_fnptr != nullptr), name##_fnptr)
bootstrap_handle_t * handle
Definition bootstrap.h:61
Definition address_list.h:100
Definition bytearray.h:30
Definition bytearray.h:53
Definition circ_queue.h:35
Definition codedesc.h:249
Definition cuda_internal.h:781
int dim
Definition cuda_internal.h:789
CUarray array
Definition cuda_internal.h:788
size_t width_in_bytes
Definition cuda_internal.h:791
size_t height
Definition cuda_internal.h:791
size_t pos[3]
Definition cuda_internal.h:790
size_t depth
Definition cuda_internal.h:791
virtual int set_rect(const RegionInstanceImpl *inst, const InstanceLayoutPieceBase *piece, size_t field_size, size_t field_offset, int ndims, const int64_t lo[], const int64_t hi[], const int order[])
Definition cuda_internal.h:543
AutoGPUContext(GPU *_gpu)
AutoGPUContext(GPU &_gpu)
GPU * gpu
Definition cuda_internal.h:550
Definition cuda_internal.h:360
Mutex mutex
Definition cuda_internal.h:376
std::vector< Thread * > worker_threads
Definition cuda_internal.h:381
int total_threads
Definition cuda_internal.h:380
CoreReservation * core_rsrv
Definition cuda_internal.h:382
int max_threads
Definition cuda_internal.h:375
void add_fence(GPUWorkFence *fence)
ContextSynchronizer(GPU *_gpu, CUcontext _context, CoreReservationSet &crs, int _max_threads)
GPU * gpu
Definition cuda_internal.h:373
int syncing_threads
Definition cuda_internal.h:380
Mutex::CondVar condvar
Definition cuda_internal.h:377
CUcontext context
Definition cuda_internal.h:374
int sleeping_threads
Definition cuda_internal.h:380
bool shutdown_flag
Definition cuda_internal.h:378
GPUWorkFence::FenceList fences
Definition cuda_internal.h:379
Definition cuda_internal.h:592
CudaDeviceMemoryInfo(CUcontext _context)
GPU * gpu
Definition cuda_internal.h:597
CUcontext context
Definition cuda_internal.h:596
Definition cuda_module.h:181
Class for managing the lifetime of a given gpu allocation. As instances of this class own an underlyi...
Definition cuda_internal.h:1148
static GPUAllocation * open_handle(GPU *gpu, OsHandle hdl, size_t size, bool peer_enabled=true)
Retrieves the GPUAllocation given the OsHandle.
static GPUAllocation * register_allocation(GPU *gpu, void *ptr, size_t size, bool peer_enabled=true)
Create an allocation that registers the given CPU address range with CUDA, making it accessible from ...
static GPUAllocation * allocate_host(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true, bool same_va=true)
Allocate CPU-located memory for the given gpu with the given size and features.
T * get_hptr(void) const
Retrieves the CPU accessible base address for the allocation, or nullptr if there is no way to access...
Definition cuda_internal.h:1207
static GPUAllocation * allocate_managed(GPU *gpu, size_t size)
Allocate migratable memory that can be used with CUDA's managed memory APIs (cuMemPrefetchAsync,...
GPUAllocation(const GPUAllocation &)=delete
size_t get_size(void) const
Retrieves the given size of the allocation.
Definition cuda_internal.h:1199
OsHandle get_os_handle(void) const
Accessor for the file descriptor or win32 HANDLE associated with the allocation. This handle can be s...
bool get_ipc_handle(CUipcMemHandle &handle) const
Retrieves the CUipcMemHandle for this allocation that can be used with GPUAllocation::open_ipc.
Definition cuda_internal.h:1171
static GPUAllocation * allocate_dev(GPU *gpu, size_t size, bool peer_enabled=true, bool shareable=true)
Allocates device-located memory for the given gpu with the given size and features.
GPUAllocation & operator=(GPUAllocation &&) noexcept
GPUAllocation(void)=default
static void * get_win32_shared_attributes(void)
Retrieves the default win32 shared attributes for creating a shared object that can be set in CUmemAl...
static GPUAllocation * open_ipc(GPU *gpu, const CUipcMemHandle &mem_hdl)
Retrieves the GPUAllocation given the CUipcMemHandle.
GPUAllocation(GPUAllocation &&other) noexcept
CUdeviceptr get_dptr(void) const
Retrieves the base CUdeviceptr for the associated allocation that can be used to access the underlyin...
Definition cuda_internal.h:1193
GPU * get_gpu(void) const
Retrieves the owning GPU.
Definition cuda_internal.h:1196
Definition cuda_internal.h:924
virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
Definition cuda_internal.h:946
GPUChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
static const bool is_ordered
Definition cuda_internal.h:931
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
GPU * get_gpu() const
Definition cuda_internal.h:942
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
Definition cuda_internal.h:718
GPURequest * req
Definition cuda_internal.h:722
void request_completed(void)
Definition cuda_internal.h:196
virtual ~GPUCompletionNotification(void)
Definition cuda_internal.h:198
virtual void request_completed(void)=0
Definition cuda_internal.h:183
void destroy_context(InternalTask *task, void *context) const override
void destroy_context(Task *task, void *context) const override
GPU * gpu
Definition cuda_internal.h:190
void * create_context(Task *task) const override
GPUContextManager(GPU *_gpu, GPUProcessor *proc)
void * create_context(InternalTask *task) const override
GPUProcessor * proc
Definition cuda_internal.h:191
Definition cuda_internal.h:631
GPU * gpu
Definition cuda_internal.h:667
virtual void release_storage_immediate(RegionInstanceImpl *inst, bool poisoned, TimeLimit work_until)
size_t cur_size
Definition cuda_internal.h:669
NetworkSegment local_segment
Definition cuda_internal.h:671
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual AllocationResult allocate_storage_immediate(RegionInstanceImpl *inst, bool need_alloc_result, bool poisoned, TimeLimit work_until)
GPUDynamicFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, size_t _max_size)
Mutex mutex
Definition cuda_internal.h:668
std::map< RegionInstance, std::pair< CUdeviceptr, size_t > > alloc_bases
Definition cuda_internal.h:670
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual ~GPUDynamicFBMemory(void)
virtual void put_bytes(off_t offset, const void *src, size_t size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
Definition cuda_internal.h:337
CUevent get_event(bool external=false)
Mutex mutex
Definition cuda_internal.h:351
void init_pool(int init_size=0)
int batch_size
Definition cuda_internal.h:352
void return_event(CUevent e, bool external=false)
std::vector< CUevent > available_events
Definition cuda_internal.h:353
GPUEventPool(int _batch_size=256)
int total_size
Definition cuda_internal.h:352
int current_size
Definition cuda_internal.h:352
int external_count
Definition cuda_internal.h:352
Definition cuda_internal.h:705
GPUFBIBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
NetworkSegment local_segment
Definition cuda_internal.h:713
CUdeviceptr base
Definition cuda_internal.h:712
GPU * gpu
Definition cuda_internal.h:711
Definition cuda_internal.h:600
NetworkSegment local_segment
Definition cuda_internal.h:628
GPU * gpu
Definition cuda_internal.h:626
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void * get_direct_ptr(off_t offset, size_t size)
CUdeviceptr base
Definition cuda_internal.h:627
virtual void put_bytes(off_t offset, const void *src, size_t size)
GPUFBMemory(RuntimeImpl *_runtime_impl, Memory _me, GPU *_gpu, CUdeviceptr _base, size_t _size)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void get_bytes(off_t offset, void *dst, size_t size)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
virtual ~GPUFBMemory(void)
Definition cuda_internal.h:848
GPU * get_gpu() const
Definition cuda_internal.h:880
static const bool is_ordered
Definition cuda_internal.h:855
GPU * src_gpu
Definition cuda_internal.h:884
virtual bool supports_indirection_memory(Memory mem) const
Queries if a given mem can be used as an indirection buffer.
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
virtual Memory suggest_ib_memories() const
GPUIndirectChannel(GPU *_src_gpu, XferDesKind _kind, BackgroundWorkManager *bgwork)
long submit(Request **requests, long nr)
virtual RemoteChannelInfo * construct_remote_info() const
virtual bool needs_wrapping_iterator() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret=0, unsigned *bw_ret=0, unsigned *lat_ret=0)
Definition cuda_internal.h:887
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUIndirectRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:905
bool serialize(S &serializer) const
GPUIndirectRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths, const std::vector< Memory > &_indirect_memories)
virtual RemoteChannel * create_remote_channel()
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:908
virtual bool needs_wrapping_iterator() const
virtual Memory suggest_ib_memories() const
virtual uint64_t supports_path(ChannelCopyInfo channel_copy_info, CustomSerdezID src_serdez_id, CustomSerdezID dst_serdez_id, ReductionOpID redop_id, size_t total_bytes, const std::vector< size_t > *src_frags, const std::vector< size_t > *dst_frags, XferDesKind *kind_ret, unsigned *bw_ret, unsigned *lat_ret)
GPUIndirectRemoteChannel(uintptr_t _remote_ptr, const std::vector< Memory > &_indirect_memories)
Definition cuda_internal.h:734
size_t read_ind_offset
Definition cuda_internal.h:750
size_t write_size
Definition cuda_internal.h:752
size_t write_offset
Definition cuda_internal.h:752
int write_ind_port_idx
Definition cuda_internal.h:753
size_t read_offset
Definition cuda_internal.h:748
size_t write_ind_offset
Definition cuda_internal.h:754
size_t read_ind_size
Definition cuda_internal.h:750
GPUIndirectTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size, int _read_ind_port_idx=-1, size_t _read_ind_offset=0, size_t _read_ind_size=0, int _write_ind_port_idx=-1, size_t _write_ind_offset=0, size_t _write_ind_size=0)
int write_port_idx
Definition cuda_internal.h:751
virtual void request_completed(void)
size_t write_ind_size
Definition cuda_internal.h:754
int read_ind_port_idx
Definition cuda_internal.h:749
XferDes * xd
Definition cuda_internal.h:746
size_t read_size
Definition cuda_internal.h:748
int read_port_idx
Definition cuda_internal.h:747
Definition cuda_internal.h:832
bool progress_xd(GPUIndirectChannel *channel, TimeLimit work_until)
std::vector< bool > dst_is_ipc
Definition cuda_internal.h:844
std::vector< GPU * > dst_gpus
Definition cuda_internal.h:843
long get_requests(Request **requests, long nr)
std::vector< GPU * > src_gpus
Definition cuda_internal.h:843
GPUIndirectXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
Definition cuda_internal.h:554
virtual void shutdown(void)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:574
virtual bool register_task(Processor::TaskFuncID func_id, CodeDescriptor &codedesc, const ByteArrayRef &user_data)
virtual ~GPUProcessor(void)
GPU * gpu
Definition cuda_internal.h:571
GPUProcessor(RuntimeImpl *runtime_impl, GPU *_gpu, Processor _me, Realm::CoreReservationSet &crs, size_t _stack_size)
std::map< Processor::TaskFuncID, GPUTaskTableEntry > gpu_task_table
Definition cuda_internal.h:584
virtual void execute_task(Processor::TaskFuncID func_id, const ByteArrayRef &task_args)
Definition cuda_internal.h:955
virtual RemoteChannel * create_remote_channel()
GPURemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPURemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:971
static RemoteChannelInfo * deserialize_new(S &deserializer)
bool serialize(S &serializer) const
Definition cuda_internal.h:974
virtual bool support_idindexed_fields(Memory src_mem, Memory dst_mem) const
Definition cuda_internal.h:980
Definition cuda_internal.h:1134
GPUReplHeapListener(CudaModule *_module)
virtual void chunk_created(void *base, size_t bytes)
virtual void chunk_destroyed(void *base, size_t bytes)
Definition cuda_internal.h:725
GPUCompletionEvent event
Definition cuda_internal.h:731
void * dst_base
Definition cuda_internal.h:728
const void * src_base
Definition cuda_internal.h:727
GPU * dst_gpu
Definition cuda_internal.h:730
Definition cuda_internal.h:247
bool ok_to_submit_copy(size_t bytes, XferDes *xd)
REALM_INTERNAL_API_EXTERNAL_LINKAGE CUstream get_stream(void) const
void add_notification(GPUCompletionNotification *notification)
void add_event(CUevent event, GPUWorkFence *fence, GPUCompletionNotification *notification=NULL, GPUWorkStart *start=NULL)
Mutex mutex
Definition cuda_internal.h:281
void add_start_event(GPUWorkStart *start)
bool has_work(void) const
GPU * gpu
Definition cuda_internal.h:276
GPU * get_gpu(void) const
GPUStream(GPU *_gpu, GPUWorker *_worker, int rel_priority=0)
GPUWorker * worker
Definition cuda_internal.h:277
bool reap_events(TimeLimit work_until)
void add_fence(GPUWorkFence *fence)
std::deque< PendingEvent > pending_events
Definition cuda_internal.h:291
void wait_on_streams(const std::set< GPUStream * > &other_streams)
CUstream stream
Definition cuda_internal.h:279
Definition cuda_internal.h:757
size_t read_offset
Definition cuda_internal.h:768
int write_port_idx
Definition cuda_internal.h:769
XferDes * xd
Definition cuda_internal.h:766
size_t write_size
Definition cuda_internal.h:770
size_t write_offset
Definition cuda_internal.h:770
GPUTransferCompletion(XferDes *_xd, int _read_port_idx, size_t _read_offset, size_t _read_size, int _write_port_idx, size_t _write_offset, size_t _write_size)
virtual void request_completed(void)
size_t read_size
Definition cuda_internal.h:768
int read_port_idx
Definition cuda_internal.h:767
Definition cuda_internal.h:203
virtual void request_cancellation(void)
IntrusiveListLink< GPUWorkFence > fence_list_link
Definition cuda_internal.h:216
GPU * gpu
Definition cuda_internal.h:224
void enqueue_on_stream(GPUStream *stream)
virtual void print(std::ostream &os) const
IntrusiveList< GPUWorkFence, REALM_PMTA_USE(GPUWorkFence, fence_list_link), DummyLock > FenceList
Definition cuda_internal.h:220
GPUWorkFence(GPU *gpu, Realm::Operation *op)
REALM_PMTA_DEFN(GPUWorkFence, IntrusiveListLink< GPUWorkFence >, fence_list_link)
static void cuda_callback(CUstream stream, CUresult res, void *data)
virtual void mark_finished(bool successful)
Definition cuda_internal.h:227
void mark_gpu_work_start()
void enqueue_on_stream(GPUStream *stream)
GPUWorkStart(Realm::Operation *op)
virtual void request_cancellation(void)
Definition cuda_internal.h:231
virtual void print(std::ostream &os) const
static void cuda_start_callback(CUstream stream, CUresult res, void *data)
Definition cuda_internal.h:298
bool process_streams(bool sleep_on_empty)
CircularQueue< GPUStream *, 16 > ActiveStreamQueue
Definition cuda_internal.h:325
void shutdown_background_thread(void)
ActiveStreamQueue active_streams
Definition cuda_internal.h:326
void add_stream(GPUStream *s)
void start_background_thread(Realm::CoreReservationSet &crs, size_t stack_size)
Realm::CoreReservation * core_rsrv
Definition cuda_internal.h:329
Mutex::CondVar condvar
Definition cuda_internal.h:323
Mutex lock
Definition cuda_internal.h:322
atomic< bool > worker_shutdown_requested
Definition cuda_internal.h:332
bool do_work(TimeLimit work_until)
bool thread_sleeping
Definition cuda_internal.h:331
Realm::Thread * worker_thread
Definition cuda_internal.h:330
Definition cuda_internal.h:796
bool progress_xd(GPUChannel *channel, TimeLimit work_until)
GPUXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority)
static size_t read_address_entry(AffineCopyInfo< 3 > &copy_infos, size_t &min_align, MemcpyTransposeInfo< size_t > &transpose_info, AddressListCursor &in_alc, uintptr_t in_base, AddressListCursor &out_alc, uintptr_t out_base, size_t bytes_left, size_t max_xfer_fields, size_t &fields_total)
long get_requests(Request **requests, long nr)
Definition cuda_internal.h:674
NetworkSegment local_segment
Definition cuda_internal.h:702
virtual ~GPUZCMemory(void)
char * cpu_base
Definition cuda_internal.h:701
virtual void * get_direct_ptr(off_t offset, size_t size)
virtual bool attempt_register_external_resource(RegionInstanceImpl *inst, size_t &inst_offset)
virtual void unregister_external_resource(RegionInstanceImpl *inst)
GPUZCMemory(RuntimeImpl *_runtime_impl, GPU *gpu, Memory _me, CUdeviceptr _gpu_base, void *_cpu_base, size_t _size, MemoryKind _kind, Memory::Kind _lowlevel_kind)
virtual ExternalInstanceResource * generate_resource_info(RegionInstanceImpl *inst, const IndexSpaceGeneric *subspace, span< const FieldID > fields, bool read_only)
virtual void put_bytes(off_t offset, const void *src, size_t size)
CUdeviceptr gpu_base
Definition cuda_internal.h:700
virtual void get_bytes(off_t offset, void *dst, size_t size)
Definition cuda_internal.h:392
void create_fb_memory(RuntimeImpl *runtime, size_t size, size_t ib_size)
void launch_transpose_kernel(MemcpyTransposeInfo< size_t > &copy_info, size_t elemSize, GPUStream *stream)
void create_dynamic_fb_memory(RuntimeImpl *runtime, size_t max_size)
CUdeviceptr fbmem_base
Definition cuda_internal.h:482
GPUFuncInfo fill_affine_large_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:479
std::vector< CudaIpcMapping > cudaipc_mappings
Definition cuda_internal.h:519
void launch_batch_affine_kernel(void *copy_info, size_t dim, size_t elemSize, size_t volume, bool multified_optimized, GPUStream *stream)
bool can_access_peer(const GPU *peer) const
GPUFBMemory * fbmem
Definition cuda_internal.h:449
GPUFuncInfo transpose_kernels[CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:480
std::unordered_map< ReductionOpID, GPUReductionOpEntry > gpu_reduction_table
Definition cuda_internal.h:539
REALM_INTERNAL_API_EXTERNAL_LINKAGE GPUStream * get_null_task_stream(void) const
GPUStream * device_to_host_stream
Definition cuda_internal.h:497
GPUProcessor * proc
Definition cuda_internal.h:446
GPUFuncInfo batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:473
std::vector< GPUStream * > task_streams
Definition cuda_internal.h:501
ContextSynchronizer ctxsync
Definition cuda_internal.h:442
CUmodule device_module
Definition cuda_internal.h:455
void create_processor(RuntimeImpl *runtime, size_t stack_size)
Mutex alloc_mutex
Definition cuda_internal.h:521
std::set< Memory > managed_mems
Definition cuda_internal.h:490
GPUStream * host_to_device_stream
Definition cuda_internal.h:496
GPUStream * get_next_d2d_stream()
GPUFuncInfo indirect_copy_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:472
GPUFuncInfo multi_batch_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:475
GPUStream * device_to_device_stream
Definition cuda_internal.h:498
size_t cupti_activity_refcount
Definition cuda_internal.h:504
std::map< NodeID, GPUStream * > cudaipc_streams
Definition cuda_internal.h:520
std::map< CUdeviceptr, GPUAllocation > allocations
Definition cuda_internal.h:448
std::set< Memory > pinned_sysmems
Definition cuda_internal.h:487
GPUFuncInfo batch_fill_affine_kernels[REALM_MAX_DIM][CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES]
Definition cuda_internal.h:477
static const size_t CUDA_MEMCPY_KERNEL_MAX2_LOG2_BYTES
Definition cuda_internal.h:469
GPUStream * get_next_task_stream(bool create=false)
void launch_indirect_copy_kernel(void *copy_info, size_t dim, size_t addr_size, size_t field_size, size_t volume, GPUStream *stream)
CUmodule load_cuda_module(const void *data)
int least_stream_priority
Definition cuda_internal.h:510
CUdeviceptr fb_ibmem_base
Definition cuda_internal.h:484
bool register_reduction(ReductionOpID redop_id, CUfunction apply_excl, CUfunction apply_nonexcl, CUfunction fold_excl, CUfunction fold_nonexcl, CUfunction apply_excl_advanced, CUfunction apply_nonexcl_advanced, CUfunction fold_excl_advanced, CUfunction fold_nonexcl_advanced, CUfunction apply_excl_transpose, CUfunction apply_nonexcl_transpose, CUfunction fold_excl_transpose, CUfunction fold_nonexcl_transpose)
GPUAllocation & add_allocation(GPUAllocation &&alloc)
GPUDynamicFBMemory * fb_dmem
Definition cuda_internal.h:450
void launch_batch_affine_fill_kernel(void *fill_info, size_t dim, size_t elemSize, size_t volume, GPUStream *stream)
int greatest_stream_priority
Definition cuda_internal.h:510
const CudaIpcMapping * find_ipc_mapping(Memory mem) const
GPUEventPool event_pool
Definition cuda_internal.h:506
CUcontext context
Definition cuda_internal.h:453
GPU(CudaModule *_module, GPUInfo *_info, GPUWorker *worker, CUcontext _context)
std::vector< GPUStream * > device_to_device_streams
Definition cuda_internal.h:499
GPUFBIBMemory * fb_ibmem
Definition cuda_internal.h:451
atomic< unsigned > next_task_stream
Definition cuda_internal.h:502
GPUStream * find_stream(CUstream stream) const
bool is_accessible_host_mem(const MemoryImpl *mem) const
GPUInfo * info
Definition cuda_internal.h:444
std::vector< GPUStream * > peer_to_peer_streams
Definition cuda_internal.h:500
GPUWorker * worker
Definition cuda_internal.h:445
void create_dma_channels(Realm::RuntimeImpl *r)
std::set< Memory > peer_fbs
Definition cuda_internal.h:493
atomic< unsigned > next_d2d_stream
Definition cuda_internal.h:503
bool is_accessible_gpu_mem(const MemoryImpl *mem) const
Definition cuda_internal.h:1003
static const bool is_ordered
Definition cuda_internal.h:1008
GPUfillChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
virtual XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total)
long submit(Request **requests, long nr)
GPU * gpu
Definition cuda_internal.h:1023
Definition cuda_internal.h:988
GPUfillXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, const void *_fill_data, size_t _fill_size, size_t _fill_total)
size_t reduced_fill_size
Definition cuda_internal.h:1000
long get_requests(Request **requests, long nr)
bool progress_xd(GPUfillChannel *channel, TimeLimit work_until)
Definition cuda_internal.h:1072
GPU * gpu
Definition cuda_internal.h:1096
RemoteChannelInfo * construct_remote_info() const override
static const bool is_ordered
Definition cuda_internal.h:1077
XferDes * create_xfer_des(uintptr_t dma_op, NodeID launch_node, XferDesID guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int priority, XferDesRedopInfo redop_info, const void *fill_data, size_t fill_size, size_t fill_total) override
long submit(Request **requests, long nr) override
bool supports_redop(ReductionOpID redop_id) const override
GPUreduceChannel(GPU *_gpu, BackgroundWorkManager *bgwork)
Definition cuda_internal.h:1099
GPUreduceRemoteChannelInfo(NodeID _owner, XferDesKind _kind, uintptr_t _remote_ptr, const std::vector< Channel::SupportedPath > &_paths)
bool serialize(S &serializer) const
virtual RemoteChannel * create_remote_channel()
static Serialization::PolymorphicSerdezSubclass< RemoteChannelInfo, GPUreduceRemoteChannelInfo > serdez_subclass
Definition cuda_internal.h:1115
static RemoteChannelInfo * deserialize_new(S &deserializer)
Definition cuda_internal.h:1118
Definition cuda_internal.h:1033
long get_requests(Request **requests, long nr)
const void * kernel_host_proxy_advanced
Definition cuda_internal.h:1065
std::vector< bool > src_is_ipc
Definition cuda_internal.h:1069
GPUStream * stream
Definition cuda_internal.h:1067
void record_redop_advanced_kernel(GPU *gpu)
GPUreduceXferDes(uintptr_t _dma_op, Channel *_channel, NodeID _launch_node, XferDesID _guid, const std::vector< XferDesPortInfo > &inputs_info, const std::vector< XferDesPortInfo > &outputs_info, int _priority, XferDesRedopInfo _redop_info)
const void * kernel_host_proxy
Definition cuda_internal.h:1064
bool progress_xd(GPUreduceChannel *channel, TimeLimit work_until)
const void * kernel_host_proxy_transpose
Definition cuda_internal.h:1066
CUfunction kernel
Definition cuda_internal.h:1061
XferDesRedopInfo redop_info
Definition cuda_internal.h:1059
std::vector< GPU * > src_gpus
Definition cuda_internal.h:1068
bool fast_reduction_kernel_mode(GPUreduceChannel *channel, const size_t max_bytes, XferPort *in_port, XferPort *out_port, const size_t in_span_start, const size_t out_span_start)
const ReductionOpUntyped * redop
Definition cuda_internal.h:1060
KernelVariantDesc describe_kernel_variant(GPU *cpu, bool is_advanced)
CUfunction kernel_transpose
Definition cuda_internal.h:1063
void setup_redop_kernel(GPUreduceChannel *channel, void *params, const size_t in_span_start, const size_t out_span_start, const size_t in_elem_size, const size_t out_elem_size, const size_t elems, const bool has_transpose)
bool resolve_kernel_slot(GPU *gpu, void *host_proxy, CUfunction &kernel_out, CUfunction GPU::GPUReductionOpEntry::*cache_field)
CUfunction kernel_advanced
Definition cuda_internal.h:1062
Definition cuda_internal.h:773
MemSpecificCudaArray(CUarray _array)
CUarray array
Definition cuda_internal.h:778
virtual ~MemSpecificCudaArray()
Definition instance.h:405
Definition ib_memory.h:30
Definition indexspace.h:1115
Definition inst_layout.h:267
Definition mem_impl.h:344
Definition proc_impl.h:141
Definition mem_impl.h:212
MemoryKind
Definition mem_impl.h:53
size_t size
Definition mem_impl.h:195
AllocationResult
Definition mem_impl.h:89
Kind
Definition memory.h:59
Definition operation.h:75
Operation * op
Definition operation.h:87
Definition operation.h:32
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition inst_impl.h:54
Definition repl_heap.h:50
Definition runtime_impl.h:265
Definition serialize.h:363
Definition channel.h:1019
Channel * channel
Definition channel.h:343
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define CUDA_DRIVER_APIS(__op__)
Definition cuda_internal.h:1409
#define NVML_APIS(__op__)
Definition cuda_internal.h:1550
#define DECL_FNPTR_EXTERN(name, ver)
Definition cuda_internal.h:1511
#define CUPTI_APIS(__op__)
Definition cuda_internal.h:1568
#define cudaDeviceProp
Definition hip_cuda.h:24
#define REALM_PMTA_USE(structtype, name)
Definition lists.h:42
CudaModule * cuda_module_singleton
CUresult cuGetProcAddress(const char *, void **, int, int)
CUresult cuCtxRecordEvent(CUcontext hctx, CUevent event)
GPUMemcpyKind
Definition cuda_internal.h:162
@ GPU_MEMCPY_PEER_TO_PEER
Definition cuda_internal.h:166
@ GPU_MEMCPY_HOST_TO_DEVICE
Definition cuda_internal.h:163
@ GPU_MEMCPY_DEVICE_TO_HOST
Definition cuda_internal.h:164
@ GPU_MEMCPY_DEVICE_TO_DEVICE
Definition cuda_internal.h:165
nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType(nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t *pNvLinkDeviceType)
nvmlIntNvLinkDeviceType_enum
Definition cuda_internal.h:1538
@ NVML_NVLINK_DEVICE_TYPE_IBMNPU
Definition cuda_internal.h:1540
@ NVML_NVLINK_DEVICE_TYPE_SWITCH
Definition cuda_internal.h:1541
@ NVML_NVLINK_DEVICE_TYPE_UNKNOWN
Definition cuda_internal.h:1542
@ NVML_NVLINK_DEVICE_TYPE_GPU
Definition cuda_internal.h:1539
enum Realm::Cuda::nvmlIntNvLinkDeviceType_enum nvmlIntNvLinkDeviceType_t
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
XferDesKind
Definition channel.h:85
int CustomSerdezID
Definition custom_serdez.h:148
int OsHandle
Definition utils.h:399
unsigned long long XferDesID
Definition channel.h:57
::realm_reduction_op_id_t ReductionOpID
Definition event.h:38
#define REALM_MAX_DIM
Definition realm_config.h:34
Definition cuda_memcpy.h:102
Definition cuda_internal.h:1125
static void handle_message(NodeID sender, const CudaIpcImportRequest &args, const void *data, size_t datalen)
unsigned count
Definition cuda_internal.h:1126
long hostid
Definition cuda_internal.h:1128
Definition cudart_hijack.h:53
Definition cuda_internal.h:127
int pci_busid
Definition cuda_internal.h:141
CUdevice device
Definition cuda_internal.h:129
size_t pci_bandwidth
Definition cuda_internal.h:145
std::vector< size_t > logical_peer_bandwidth
Definition cuda_internal.h:148
int pci_domainid
Definition cuda_internal.h:142
CUuuid uuid
Definition cuda_internal.h:131
std::set< CUdevice > peers
Definition cuda_internal.h:140
bool has_numa_preference
Definition cuda_internal.h:138
bool pageable_access_supported
Definition cuda_internal.h:154
std::vector< size_t > logical_peer_latency
Definition cuda_internal.h:149
bool host_gpu_same_va
Definition cuda_internal.h:147
unsigned fabric_clique
Definition cuda_internal.h:152
bool fabric_supported
Definition cuda_internal.h:151
char name[MAX_NAME_LEN]
Definition cuda_internal.h:135
int major
Definition cuda_internal.h:132
size_t totalGlobalMem
Definition cuda_internal.h:136
int pci_deviceid
Definition cuda_internal.h:143
nvmlDevice_t nvml_dev
Definition cuda_internal.h:130
unsigned long numa_node_affinity[MAX_NUMA_NODE_LEN]
Definition cuda_internal.h:139
size_t c2c_bandwidth
Definition cuda_internal.h:144
int index
Definition cuda_internal.h:128
int minor
Definition cuda_internal.h:133
size_t nvswitch_bandwidth
Definition cuda_internal.h:146
CUuuid fabric_uuid
Definition cuda_internal.h:153
static const size_t MAX_NAME_LEN
Definition cuda_internal.h:134
static const size_t MAX_NUMA_NODE_LEN
Definition cuda_internal.h:137
Definition cuda_internal.h:576
Cuda::StreamAwareTaskFuncPtr stream_aware_fnptr
Definition cuda_internal.h:578
Processor::TaskFuncPtr fnptr
Definition cuda_internal.h:577
ByteArray user_data
Definition cuda_internal.h:579
Definition cuda_internal.h:282
GPUWorkStart * start
Definition cuda_internal.h:285
CUevent event
Definition cuda_internal.h:283
GPUWorkFence * fence
Definition cuda_internal.h:284
GPUCompletionNotification * notification
Definition cuda_internal.h:286
Definition cuda_internal.h:512
uintptr_t address_offset
Definition cuda_internal.h:517
NodeID owner
Definition cuda_internal.h:513
GPU * src_gpu
Definition cuda_internal.h:514
Memory mem
Definition cuda_internal.h:515
uintptr_t local_base
Definition cuda_internal.h:516
Definition cuda_internal.h:457
CUfunction func
Definition cuda_internal.h:458
int occ_num_threads
Definition cuda_internal.h:459
int occ_num_blocks
Definition cuda_internal.h:460
Definition cuda_internal.h:524
CUfunction fold_excl
Definition cuda_internal.h:528
CUfunction fold_nonexcl_transpose
Definition cuda_internal.h:535
CUfunction fold_excl_transpose
Definition cuda_internal.h:536
CUfunction apply_excl_advanced
Definition cuda_internal.h:530
CUfunction apply_excl_transpose
Definition cuda_internal.h:534
CUfunction apply_nonexcl
Definition cuda_internal.h:525
CUfunction apply_nonexcl_advanced
Definition cuda_internal.h:529
CUfunction fold_nonexcl
Definition cuda_internal.h:527
CUfunction apply_excl
Definition cuda_internal.h:526
CUfunction fold_nonexcl_advanced
Definition cuda_internal.h:531
CUfunction fold_excl_advanced
Definition cuda_internal.h:532
CUfunction apply_nonexcl_transpose
Definition cuda_internal.h:533
Definition cuda_internal.h:1028
void * host_proxy
Definition cuda_internal.h:1029
CUfunction GPU::GPUReductionOpEntry::* cache_field
Definition cuda_internal.h:1030
Definition cuda_memcpy.h:114
Definition cudart_hijack.h:65
Definition cudart_hijack.h:76
NodeID src
Definition ucp_internal.h:1