Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
operation.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_OPERATION_H
19#define REALM_OPERATION_H
20
21#include "realm/profiling.h"
22#include "realm/event_impl.h"
23#include "realm/atomics.h"
24
25#include "realm/network.h"
26
27#include <set>
28#include <iostream>
29
30namespace Realm {
31
// Abstract base class for operations (print() is pure virtual). The
// constructor and destructor are protected: instances are created through
// subclasses, and destruction happens when the last reference is removed
// (see add_reference()/remove_reference() below).
// NOTE(review): this is a rendered listing whose embedded line numbers skip
// in places (e.g. 66 -> 68, 74 -> 76, 86 -> 88); the declarations on the
// skipped lines are not visible here.
32 class Operation {
33 protected:
34 // must be subclassed
35 Operation(GenEventImpl *_finish_event, EventImpl::gen_t _finish_gen,
36 const ProfilingRequestSet &_requests);
37
38 // can't destroy directly either - done when last reference is removed
39 // (subclasses may still override the method - just not call it directly)
40 virtual ~Operation(void);
41
42 public:
// reference management - per the destructor comment above, the operation is
// destroyed when the last reference is removed
43 void add_reference(void);
44 void remove_reference(void);
45
46 // marks operation ready - returns true if it should be enqueued for execution
47 // (i.e. it hasn't been cancelled)
48 virtual bool mark_ready(void);
49
50 // marks operation started - returns true if successful, false if a cancellation
51 // request has arrived
52 virtual bool mark_started(void);
53
// NOTE(review): presumably these record the end of execution (normal/failed
// finish vs. termination with an error code and details) - confirm against
// the implementation
54 virtual void mark_finished(bool successful);
55 virtual void mark_terminated(int error_code, const ByteArray &details);
56
57 // returns true if it's able to perform the cancellation (or if nothing can be done)
58 // returns false if a subclass wants to try some other means to cancel an operation
59 virtual bool attempt_cancellation(int error_code, const void *reason_data,
60 size_t reason_size);
61
62 virtual void set_priority(int new_priority);
63
64 // a common reason for cancellation is a poisoned precondition - this helper
65 // takes care of recording the error code and marking the operation as
66 // (unsuccessfully) finished
// NOTE(review): the helper itself (listing line 67) is missing here -
// presumably handle_poisoned_precondition(Event pre) from the member index;
// confirm
68
69 bool cancellation_requested(void) const;
70
// pure virtual - every concrete operation must provide its own printer
71 virtual void print(std::ostream &os) const = 0;
72
73 // abstract class to describe asynchronous work started by an operation
74 // that must finish for the operation to become "complete"
// NOTE(review): the nested class header (listing line 75, AsyncWorkItem per
// the destructor name below) and its constructor line are not visible here
76 public:
78 virtual ~AsyncWorkItem(void);
79
80 virtual void mark_finished(bool successful);
81
// pure virtual - subclasses define how their asynchronous work is cancelled
82 virtual void request_cancellation(void) = 0;
83
84 virtual void print(std::ostream &os) const = 0;
85
86 protected:
88
89 // the next_item field is effectively owned by the Operation class
90 friend class Operation;
91 friend std::ostream &operator<<(std::ostream &os, Operation *op);
92
94 };
95
96 // once added, the item belongs to the operation (i.e. will be deleted with
97 // the operation)
99
100 // used to record event wait intervals, if desired
103
104 // used to measure when device-side work starts for a gpu task
107 void add_gpu_work_start(uint64_t timestamp);
108 void add_gpu_work_end(uint64_t timestamp);
109
110 protected:
111 // called by AsyncWorkItem::mark_finished from an arbitrary thread
112 void work_item_finished(AsyncWorkItem *item, bool successful);
113 virtual void mark_completed(void);
114
115 void clear_profiling(void);
117
118 void trigger_finish_event(bool poisoned);
119
121
125
126 public:
129
130 protected:
133
134 // allow operations to lazily update their state
136
146
147 // append-only list (until Operation destruction)
// NOTE(review): the list member described above is on a skipped line
// (presumably all_work_items, per the member index); the counter below is a
// separate field
149 atomic<int> pending_work_items; // uses atomics so we don't have to take lock to check
151
152 friend std::ostream &operator<<(std::ostream &os, Operation *op);
153 };
154
// NOTE(review): the class header (listing line 155, OperationTable per the
// member index and the TableCleaner constructor below) is not visible in this
// listing; other skipped line numbers likewise hide declarations.
156 public:
159
160 // Operations are 'owned' by the table - the table will free them once it
161 // gets the completion event for them
// NOTE(review): `remote_note` below is presumably a typo for `remote_node`
// (the member index lists a TableEntry field named remote_node) - confirm
162 void add_local_operation(Event finish_event, Operation *local_op);
163 void add_remote_operation(Event finish_event, int remote_note);
164
// requests cancellation of the operation identified by finish_event, passing
// along an opaque reason buffer of reason_size bytes
// NOTE(review): meaning of the bool result is not visible here - confirm
165 bool request_cancellation(Event finish_event, const void *reason_data,
166 size_t reason_size);
167
168 void set_priority(Event finish_event, int new_priority);
169
170 void print_operations(std::ostream &os);
171
172 static void register_handlers(void);
173
174 // checks that all operations have finished before shutdown
175 void shutdown_check(void);
176
177 protected:
179
// The TableCleaner class below is compiled out. Its event_triggered(bool)
// returns bool, whereas TableEntry::event_triggered(bool, TimeLimit) further
// down returns void.
// NOTE(review): looks like stale code kept for reference - confirm before
// re-enabling
180#if 0
181 class TableCleaner : public EventWaiter {
182 public:
183 TableCleaner(OperationTable *_table);
184 virtual bool event_triggered(bool poisoned);
185 virtual void print(std::ostream& os) const;
186 virtual Event get_finish_event(void) const;
187
188 protected:
189 OperationTable *table;
190 };
191#endif
192
// per-event table entry; it is itself an EventWaiter
// NOTE(review): the data members (listing lines 198-204) are not visible in
// this listing
193 struct TableEntry : public EventWaiter {
194 virtual void event_triggered(bool poisoned, TimeLimit work_until);
195 virtual void print(std::ostream &os) const;
196 virtual Event get_finish_event(void) const;
197
205 };
// NOTE(review): std::map is used here, but only <set> and <iostream> are
// among the standard headers visibly included by this file - presumably <map>
// arrives transitively; consider including it directly
206 typedef std::map<Event, TableEntry> Table;
207
// the mutexes/tables below exist only when REALM_USE_OPERATION_TABLE is defined
208#ifdef REALM_USE_OPERATION_TABLE
209 // event table is protected by a mutex
210 // try to avoid a serial bottleneck by splitting events over 4 different tables
211 static const int NUM_TABLES = 4;
212
// one mutex per table, indexed in parallel
213 Mutex mutexes[NUM_TABLES];
214 Table tables[NUM_TABLES];
215 // TableCleaner cleaner;
216#endif
217 };
218
221
// receive-side handler for a CancelOperationMessage sent by node `sender`;
// `data`/`datalen` describe the payload that accompanies the message
// NOTE(review): the struct header and its field (listing lines 219-220) are
// not visible here; the payload presumably carries the cancellation reason -
// confirm
222 static void handle_message(NodeID sender, const CancelOperationMessage &msg,
223 const void *data, size_t datalen);
224 };
225
226}; // namespace Realm
227
228#include "realm/operation.inl"
229
230#endif // REALM_OPERATION_H
Definition bytearray.h:53
unsigned gen_t
Definition event_impl.h:87
Definition event_impl.h:49
Definition event.h:50
Definition event_impl.h:198
Definition operation.h:155
void print_operations(std::ostream &os)
void add_local_operation(Event finish_event, Operation *local_op)
std::map< Event, TableEntry > Table
Definition operation.h:206
static void register_handlers(void)
void add_remote_operation(Event finish_event, int remote_note)
void set_priority(Event finish_event, int new_priority)
void shutdown_check(void)
void event_triggered(Event e)
bool request_cancellation(Event finish_event, const void *reason_data, size_t reason_size)
Definition operation.h:75
AsyncWorkItem * next_item
Definition operation.h:93
virtual void mark_finished(bool successful)
virtual void print(std::ostream &os) const =0
Operation * op
Definition operation.h:87
friend std::ostream & operator<<(std::ostream &os, Operation *op)
virtual void request_cancellation(void)=0
Definition operation.h:32
GenEventImpl * finish_event
Definition operation.h:122
virtual bool mark_ready(void)
void clear_profiling(void)
void add_gpu_work_end(uint64_t timestamp)
bool wants_event_waits
Definition operation.h:142
Operation(GenEventImpl *_finish_event, EventImpl::gen_t _finish_gen, const ProfilingRequestSet &_requests)
virtual void print(std::ostream &os) const =0
void add_async_work_item(AsyncWorkItem *item)
virtual bool attempt_cancellation(int error_code, const void *reason_data, size_t reason_size)
ProfilingMeasurements::OperationTimelineGPU timeline_gpu
Definition operation.h:141
atomic< int > pending_work_items
Definition operation.h:149
void trigger_finish_event(bool poisoned)
void add_finish_event_precondition(Event precondition)
virtual void handle_poisoned_precondition(Event pre)
virtual bool mark_started(void)
void add_reference(void)
ProfilingMeasurements::OperationStatus Status
Definition operation.h:131
atomic< AsyncWorkItem * > all_work_items
Definition operation.h:148
bool wants_timeline
Definition operation.h:138
ProfilingRequestSet requests
Definition operation.h:144
void remove_reference(void)
ProfilingMeasurements::OperationEventWaits waits
Definition operation.h:143
bool cancellation_requested(void) const
void work_item_finished(AsyncWorkItem *item, bool successful)
void add_gpu_work_start(uint64_t timestamp)
atomic< Status::Result > state
Definition operation.h:132
atomic< int > refcount
Definition operation.h:124
atomic< int > failed_work_items
Definition operation.h:150
friend std::ostream & operator<<(std::ostream &os, Operation *op)
ProfilingMeasurementCollection measurements
Definition operation.h:145
void reconstruct_measurements()
bool wants_gpu_timeline
Definition operation.h:140
virtual void mark_terminated(int error_code, const ByteArray &details)
virtual void mark_completed(void)
virtual ~Operation(void)
virtual void set_priority(int new_priority)
Event get_finish_event(void) const
ProfilingMeasurements::OperationStatus status
Definition operation.h:137
void mark_gpu_work_start()
void send_profiling_data(void)
EventImpl::gen_t finish_gen
Definition operation.h:123
virtual void mark_finished(bool successful)
ProfilingMeasurements::OperationTimeline timeline
Definition operation.h:139
virtual Status::Result get_state(void)
ProfilingMeasurements::OperationEventWaits::WaitInterval * create_wait_interval(Event e)
bool wants_gpu_work_start() const
Definition profiling.h:393
Definition profiling.h:363
Definition timers.h:129
Definition mutex.h:223
Definition atomics.h:31
Definition activemsg.h:38
int NodeID
Definition nodeset.h:40
Definition operation.h:219
static void handle_message(NodeID sender, const CancelOperationMessage &msg, const void *data, size_t datalen)
Event finish_event
Definition operation.h:220
Definition operation.h:193
virtual void event_triggered(bool poisoned, TimeLimit work_until)
Operation * local_op
Definition operation.h:200
Event finish_event
Definition operation.h:199
bool pending_cancellation
Definition operation.h:202
virtual Event get_finish_event(void) const
OperationTable * table
Definition operation.h:198
void * reason_data
Definition operation.h:203
int remote_node
Definition operation.h:201
size_t reason_size
Definition operation.h:204
virtual void print(std::ostream &os) const