Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
profiling.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18// profiling infrastructure for Realm tasks, copies, etc.
19
20#ifndef REALM_PROFILING_H
21#define REALM_PROFILING_H
22
23#include <limits.h>
24#include <vector>
25#include <set>
26#include <map>
27
28#include "realm/bytearray.h"
29#include "realm/processor.h"
30#include "realm/memory.h"
31#include "realm/instance.h"
32#include "realm/faults.h"
33
34namespace Realm {
35
36 // through the wonders of templates, users should never need to work with
37 // these IDs directly
39 {
40 PMID_OP_STATUS, // completion status of operation
41 PMID_OP_STATUS_ABNORMAL, // completion status only if abnormal
42 PMID_OP_BACKTRACE, // backtrace of a failed operation
43 PMID_OP_TIMELINE, // when task was ready, started, completed
44 PMID_OP_EVENT_WAITS, // intervals when operation is waiting on events
45 PMID_OP_PROC_USAGE, // processor used by task
46 PMID_OP_MEM_USAGE, // memories used by a copy
47 PMID_INST_STATUS, // "completion" status of an instance
48 PMID_INST_STATUS_ABNORMAL, // completion status only if abnormal
49 PMID_INST_ALLOCRESULT, // success/failure of instance allocation
50 PMID_INST_TIMELINE, // timeline for a physical instance
51 PMID_INST_MEM_USAGE, // memory and size used by an instance
52 PMID_PCTRS_CACHE_L1I, // L1 I$ performance counters
53 PMID_PCTRS_CACHE_L1D, // L1 D$ performance counters
54 PMID_PCTRS_CACHE_L2, // L2 D$ performance counters
55 PMID_PCTRS_CACHE_L3, // L3 D$ performance counters
56 PMID_PCTRS_IPC, // instructions/clocks performance counters
57 PMID_PCTRS_TLB, // TLB miss counters
58 PMID_PCTRS_BP, // branch predictor performance counters
59 PMID_OP_TIMELINE_GPU, // when a task was started and completed on the GPU
60 PMID_OP_SUBGRAPH_INFO, // identifying info for containing subgraph(s)
61 PMID_OP_FINISH_EVENT, // finish event for an operation
62 PMID_OP_COPY_INFO, // copy transfer details
63 // as the name suggests, this should always be last, allowing apps/runtimes
64 // sitting on top of Realm to use some of the ID space
66 };
67
68 namespace ProfilingMeasurements {
88
92
95
96 std::vector<uintptr_t> pcs;
97 std::vector<std::string> symbols;
98 };
99
102
103 // all times reported in nanoseconds from the start of program execution
104 // on some node. This is necessary because clients can't know where the
105 // measurement times were recorded and therefore have no reference. There
106 // may be skews between the start times of different nodes.
107 typedef long long timestamp_t;
108 static const timestamp_t INVALID_TIMESTAMP = LLONG_MIN;
109
117
118 timestamp_t create_time; // when was operation created?
119 timestamp_t ready_time; // when was operation ready to proceed?
120 timestamp_t start_time; // when did operation start?
121 timestamp_t end_time; // when did operation end (on processor)?
122 timestamp_t complete_time; // when was all work for operation complete?
123
124 inline void record_create_time(void);
125 inline void record_ready_time(void);
126 inline void record_start_time(void);
127 inline void record_end_time(void);
128 inline void record_complete_time(void);
129 inline bool is_valid(void) const;
130 };
131
134
135 // all times reported in nanoseconds from the start of program execution
136 // on some node. This is necessary because clients can't know where the
137 // measurement times were recorded and therefore have no reference. There
138 // may be skews between the start times of different nodes.
139 typedef long long timestamp_t;
140 static const timestamp_t INVALID_TIMESTAMP = LLONG_MIN;
141
146 timestamp_t start_time; // when did the task start on the GPU?
147 timestamp_t end_time; // when did the task complete on the GPU?
148
149 inline bool is_valid(void) const;
150 };
151
152 // records time intervals in which the operation was waiting on events
155
156 typedef long long timestamp_t;
157 static const timestamp_t INVALID_TIMESTAMP = LLONG_MIN;
158
160 timestamp_t wait_start; // when did the interval begin?
161 timestamp_t wait_ready; // when did the event trigger?
162 timestamp_t wait_end; // when did the interval actually end
163 Event wait_event; // which event was waited on
164
165 inline void record_wait_start(void);
166 inline void record_wait_ready(void);
167 inline void record_wait_end(void);
168 };
169
170 std::vector<WaitInterval> intervals;
171 };
172
173 // Track processor used for tasks
178
179 // Track memories used for copies
186
187 // Track transfer details for copies
190 // for each request create this
197
198 struct InstInfo {
199 std::vector<RegionInstance> src_insts; // source instances
200 std::vector<RegionInstance> dst_insts; // destination instances
201 RegionInstance src_indirection_inst; // source indirection instance (gather)
202 RegionInstance dst_indirection_inst; // destination indirection instance (scatter)
203 std::vector<FieldID> src_fields; // source fields
204 std::vector<FieldID> dst_fields; // destination fields
205 FieldID src_indirection_field; // field of indirection points (source side)
206 FieldID dst_indirection_field; // field of indirection points (destination side)
207 RequestType request_type; // kind of request: fill, reduce, copy
208 unsigned int num_hops; // number of hops for this request
209 };
210 std::vector<InstInfo> inst_info;
211 };
212
217
220 // TODO: probably can do something more useful here
221 // finish events listed from inside out (i.e. [0] is immediately
222 // containing subgraph)
223 std::vector<Event> subgraph_finish_events;
224 };
225
226 // Track the status of an instance
246
250
251 // simple boolean indicating whether or not allocation is expected to
252 // succeed
258
259 // Track the timeline of an instance
262
263 // all times reported in nanoseconds from the start of program execution
264 // on some node. This is necessary because clients can't know where the
265 // measurement times were recorded and therefore have no reference. There
266 // may be skews between the start times of different nodes.
267 typedef unsigned long long timestamp_t;
269
271 timestamp_t create_time; // when was instance created?
272 timestamp_t ready_time; // when was instance ready for use?
273 timestamp_t delete_time; // when was the instance deleted?
274
275 inline void record_create_time(void);
276 inline void record_ready_time(void);
277 inline void record_delete_time(void);
278 };
279
280 // Track properties of an instance
287
288 // Processor cache stats
289 template <ProfilingMeasurementID _ID>
291 static const ProfilingMeasurementID ID = _ID;
292 long long accesses;
293 long long misses;
294 };
295
300
301 // instructions/cycles
304 long long total_insts;
305 long long total_cycles;
306 long long fp_insts;
307 long long ld_insts;
308 long long st_insts;
309 long long br_insts;
310 };
311
314 long long inst_misses;
315 long long data_misses;
316 };
317
324 }; // namespace ProfilingMeasurements
325
327 public:
328 ProfilingRequest(Processor _response_proc, Processor::TaskFuncID _response_task_id,
329 int _priority = 0, bool _report_if_empty = false);
331
333
335
336 ProfilingRequest &add_user_data(const void *payload, size_t payload_size);
337
338 template <typename T>
340
343 add_measurements(const std::set<ProfilingMeasurementID> &measurement_ids);
344
345 template <typename S>
347
348 protected:
350
351 template <typename S>
352 friend bool serialize(S &s, const ProfilingRequest &pr);
353
359 std::set<ProfilingMeasurementID> requested_measurements;
360 };
361
362 // manages a set of profiling requests attached to a Realm operation
364 public:
367
369
371
373 Processor::TaskFuncID response_task_id,
374 const void *payload = 0, size_t payload_size = 0,
375 int priority = 0, bool report_if_empty = false);
376
377 size_t request_count(void) const;
378 bool empty(void) const;
379
380 void clear(void);
381
382 protected:
384
385 template <typename S>
386 friend bool serialize(S &s, const ProfilingRequestSet &prs);
387 template <typename S>
388 friend bool deserialize(S &s, ProfilingRequestSet &prs);
389
390 std::vector<ProfilingRequest *> requests;
391 };
392
394 public:
397
400 void clear(void);
401
402 // clears only recorded measurements (keeps request info)
404
405 template <typename T>
406 bool wants_measurement(void) const;
407
408 template <typename T>
409 void add_measurement(const T &data, bool send_complete_responses = true);
410
411 protected:
412 void send_response(const ProfilingRequest &pr) const;
413
414 // in order to efficiently send responses as soon as we have all the requested
415 // measurements, we need to know which profiling requests want a given
416 // measurement and how many more measurements each request is still waiting
417 // for
418 std::map<ProfilingMeasurementID, std::vector<const ProfilingRequest *>>
420 std::map<const ProfilingRequest *, int> measurements_left;
421 bool completed_requests_present; // set if a request is completed but could not be
422 // sent right away
423
424 std::map<ProfilingMeasurementID, ByteArray> measurements;
425 };
426
428 public:
429 // responses need to be deserialized from the response task's argument data
430 ProfilingResponse(const void *_data, size_t _data_size);
432
433 const void *user_data(void) const;
434 size_t user_data_size(void) const;
435
436 // even if a measurement was requested, it may not have been performed - use
437 // this to check
438 template <typename T>
439 bool has_measurement(void) const;
440
441 // extracts a measurement (if available), returning a dynamically allocated result -
442 // caller should delete it when done
443 template <typename T>
444 T *get_measurement(void) const;
445
446 // extracts a measurement (if available), filling in a caller-allocated
447 // result - returns true if result available, false if not
448 template <typename T>
449 bool get_measurement(T &result) const;
450
451 protected:
452 const char *data;
453 size_t data_size;
456 const int *ids;
457
459 bool find_id(int id, int &offset, int &size) const;
460 };
461
462}; // namespace Realm
463
464#include "realm/profiling.inl"
465
466#endif // ifndef REALM_PROFILING_H
Definition bytearray.h:53
Definition event.h:50
Definition id.h:30
Definition memory.h:33
Definition processor.h:37
::realm_task_func_id_t TaskFuncID
Definition processor.h:58
Definition profiling.h:393
std::map< ProfilingMeasurementID, ByteArray > measurements
Definition profiling.h:424
std::map< const ProfilingRequest *, int > measurements_left
Definition profiling.h:420
void send_responses(const ProfilingRequestSet &prs)
void send_response(const ProfilingRequest &pr) const
bool completed_requests_present
Definition profiling.h:421
void import_requests(const ProfilingRequestSet &prs)
void add_measurement(const T &data, bool send_complete_responses=true)
std::map< ProfilingMeasurementID, std::vector< const ProfilingRequest * > > requested_measurements
Definition profiling.h:419
Definition profiling.h:363
friend bool serialize(S &s, const ProfilingRequestSet &prs)
ProfilingRequest & add_request(Processor response_proc, Processor::TaskFuncID response_task_id, const void *payload=0, size_t payload_size=0, int priority=0, bool report_if_empty=false)
friend bool deserialize(S &s, ProfilingRequestSet &prs)
std::vector< ProfilingRequest * > requests
Definition profiling.h:390
ProfilingRequestSet(const ProfilingRequestSet &to_copy)
ProfilingRequestSet & operator=(const ProfilingRequestSet &rhs)
bool empty(void) const
size_t request_count(void) const
Definition profiling.h:326
ProfilingRequest & add_measurement(ProfilingMeasurementID measurement_id)
ProfilingRequest & add_user_data(const void *payload, size_t payload_size)
ProfilingRequest & operator=(const ProfilingRequest &rhs)
Processor response_proc
Definition profiling.h:354
ProfilingRequest & add_measurements(const std::set< ProfilingMeasurementID > &measurement_ids)
int priority
Definition profiling.h:356
friend bool serialize(S &s, const ProfilingRequest &pr)
Processor::TaskFuncID response_task_id
Definition profiling.h:355
ProfilingRequest & add_measurement(void)
bool report_if_empty
Definition profiling.h:357
std::set< ProfilingMeasurementID > requested_measurements
Definition profiling.h:359
static ProfilingRequest * deserialize_new(S &s)
ProfilingRequest(Processor _response_proc, Processor::TaskFuncID _response_task_id, int _priority=0, bool _report_if_empty=false)
ProfilingRequest(const ProfilingRequest &to_copy)
ByteArray user_data
Definition profiling.h:358
Definition profiling.h:427
T * get_measurement(void) const
REALM_INTERNAL_API_EXTERNAL_LINKAGE bool find_id(int id, int &offset, int &size) const
size_t user_data_offset
Definition profiling.h:455
size_t data_size
Definition profiling.h:453
bool has_measurement(void) const
size_t user_data_size(void) const
bool get_measurement(T &result) const
int measurement_count
Definition profiling.h:454
const int * ids
Definition profiling.h:456
const char * data
Definition profiling.h:452
ProfilingResponse(const void *_data, size_t _data_size)
const void * user_data(void) const
Definition instance.h:66
#define REALM_INTERNAL_API_EXTERNAL_LINKAGE
Definition compiler_support.h:218
#define REALM_PUBLIC_API
Definition compiler_support.h:217
CachePerfCounters< PMID_PCTRS_CACHE_L2 > L2CachePerfCounters
Definition profiling.h:298
CachePerfCounters< PMID_PCTRS_CACHE_L1I > L1ICachePerfCounters
Definition profiling.h:296
CachePerfCounters< PMID_PCTRS_CACHE_L1D > L1DCachePerfCounters
Definition profiling.h:297
CachePerfCounters< PMID_PCTRS_CACHE_L3 > L3CachePerfCounters
Definition profiling.h:299
Definition activemsg.h:38
realm_field_id_t FieldID
Definition instance.h:45
ProfilingMeasurementID
Definition profiling.h:39
@ PMID_OP_SUBGRAPH_INFO
Definition profiling.h:60
@ PMID_OP_TIMELINE
Definition profiling.h:43
@ PMID_PCTRS_IPC
Definition profiling.h:56
@ PMID_REALM_LAST
Definition profiling.h:65
@ PMID_PCTRS_CACHE_L1D
Definition profiling.h:53
@ PMID_PCTRS_TLB
Definition profiling.h:57
@ PMID_PCTRS_CACHE_L1I
Definition profiling.h:52
@ PMID_OP_BACKTRACE
Definition profiling.h:42
@ PMID_INST_MEM_USAGE
Definition profiling.h:51
@ PMID_INST_STATUS_ABNORMAL
Definition profiling.h:48
@ PMID_PCTRS_CACHE_L2
Definition profiling.h:54
@ PMID_OP_MEM_USAGE
Definition profiling.h:46
@ PMID_PCTRS_BP
Definition profiling.h:58
@ PMID_INST_TIMELINE
Definition profiling.h:50
@ PMID_OP_COPY_INFO
Definition profiling.h:62
@ PMID_PCTRS_CACHE_L3
Definition profiling.h:55
@ PMID_OP_STATUS_ABNORMAL
Definition profiling.h:41
@ PMID_OP_TIMELINE_GPU
Definition profiling.h:59
@ PMID_OP_FINISH_EVENT
Definition profiling.h:61
@ PMID_INST_STATUS
Definition profiling.h:47
@ PMID_OP_STATUS
Definition profiling.h:40
@ PMID_OP_PROC_USAGE
Definition profiling.h:45
@ PMID_OP_EVENT_WAITS
Definition profiling.h:44
@ PMID_INST_ALLOCRESULT
Definition profiling.h:49
long long taken_branches
Definition profiling.h:321
long long total_branches
Definition profiling.h:320
long long mispredictions
Definition profiling.h:322
long long accesses
Definition profiling.h:292
long long misses
Definition profiling.h:293
long long st_insts
Definition profiling.h:308
long long total_cycles
Definition profiling.h:305
long long fp_insts
Definition profiling.h:306
long long ld_insts
Definition profiling.h:307
long long total_insts
Definition profiling.h:304
long long br_insts
Definition profiling.h:309
size_t bytes
Definition profiling.h:285
RegionInstance instance
Definition profiling.h:283
Memory memory
Definition profiling.h:284
Result result
Definition profiling.h:242
@ AWAITING_ALLOCATION
Definition profiling.h:232
@ INSTANCE_COUNT_EXCEEDED
Definition profiling.h:239
@ MEMORY_LOST
Definition profiling.h:238
@ CORRUPTED
Definition profiling.h:237
@ ALLOCATED
Definition profiling.h:235
@ FAILED_ALLOCATION
Definition profiling.h:233
@ CANCELLED_ALLOCATION
Definition profiling.h:234
@ DESTROYED_SUCCESSFULLY
Definition profiling.h:236
ByteArray error_details
Definition profiling.h:244
int error_code
Definition profiling.h:243
RegionInstance instance
Definition profiling.h:270
static const timestamp_t INVALID_TIMESTAMP
Definition profiling.h:268
timestamp_t delete_time
Definition profiling.h:273
timestamp_t create_time
Definition profiling.h:271
timestamp_t ready_time
Definition profiling.h:272
unsigned long long timestamp_t
Definition profiling.h:267
std::vector< uintptr_t > pcs
Definition profiling.h:96
std::vector< std::string > symbols
Definition profiling.h:97
RegionInstance src_indirection_inst
Definition profiling.h:201
RequestType request_type
Definition profiling.h:207
std::vector< FieldID > src_fields
Definition profiling.h:203
std::vector< RegionInstance > src_insts
Definition profiling.h:199
unsigned int num_hops
Definition profiling.h:208
std::vector< RegionInstance > dst_insts
Definition profiling.h:200
FieldID src_indirection_field
Definition profiling.h:205
std::vector< FieldID > dst_fields
Definition profiling.h:204
RegionInstance dst_indirection_inst
Definition profiling.h:202
FieldID dst_indirection_field
Definition profiling.h:206
std::vector< InstInfo > inst_info
Definition profiling.h:210
long long timestamp_t
Definition profiling.h:156
std::vector< WaitInterval > intervals
Definition profiling.h:170
static const timestamp_t INVALID_TIMESTAMP
Definition profiling.h:157
Event finish_event
Definition profiling.h:215
Memory source
Definition profiling.h:182
Memory target
Definition profiling.h:183
Processor proc
Definition profiling.h:176
ByteArray error_details
Definition profiling.h:86
Result result
Definition profiling.h:84
@ COMPLETED_WITH_ERRORS
Definition profiling.h:75
@ TERMINATED_EARLY
Definition profiling.h:78
@ INTERRUPT_REQUESTED
Definition profiling.h:77
@ COMPLETED_SUCCESSFULLY
Definition profiling.h:74
int error_code
Definition profiling.h:85
std::vector< Event > subgraph_finish_events
Definition profiling.h:223
timestamp_t end_time
Definition profiling.h:147
OperationTimelineGPU()
Definition profiling.h:142
timestamp_t start_time
Definition profiling.h:146
static const timestamp_t INVALID_TIMESTAMP
Definition profiling.h:140
long long timestamp_t
Definition profiling.h:139
timestamp_t end_time
Definition profiling.h:121
static const timestamp_t INVALID_TIMESTAMP
Definition profiling.h:108
timestamp_t start_time
Definition profiling.h:120
OperationTimeline()
Definition profiling.h:110
timestamp_t ready_time
Definition profiling.h:119
timestamp_t complete_time
Definition profiling.h:122
long long timestamp_t
Definition profiling.h:107
timestamp_t create_time
Definition profiling.h:118
long long data_misses
Definition profiling.h:315
long long inst_misses
Definition profiling.h:314