Realm
A distributed, event-based tasking library
Loading...
Searching...
No Matches
caching_allocator.h
Go to the documentation of this file.
1/*
2 * Copyright 2025 Stanford University, NVIDIA Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef REALM_CACHING_ALLOCATOR_H
19#define REALM_CACHING_ALLOCATOR_H
20
21#include "realm/atomics.h"
23#include "realm/lists.h"
24#include "realm/mutex.h"
25
26#include <memory>
27
28namespace Realm {
29
30 template <typename T, size_t N>
32 public:
33 typedef T value_type;
34 enum
35 {
36 BLOCK_SIZE = N
37 };
38
39 private:
40 struct Block {
41 struct Chunk {
42 alignas(T) char data[sizeof(T)];
44 // Index in the block list.
45 // TODO: remove if the blocks are aligned properly
46 size_t idx;
48 };
49
50 atomic<ssize_t> num_alloced_chunks;
51 Chunk chunks[BLOCK_SIZE];
52 atomic<Chunk *> free_chunk_head;
53
54 IntrusiveListLink<Block> block_link;
55 REALM_PMTA_DEFN(Block, IntrusiveListLink<Block>, block_link);
// Mutex-guarded intrusive list of reclaimed Blocks, shared by all threads
// (used for the class-wide free_blocks list).
56 struct BlockList
57 : public IntrusiveList<Block, REALM_PMTA_USE(Block, block_link), Mutex> {
// NOTE(review): the destructor signature (~BlockList(void), original
// line 58 per the symbol index) is missing from this rendering.
59 {
60 // This should only be called on process exit, and is only here to clean
61 // up asan.
// Unlink each node before deleting it so ~Block (which deletes
// block_link.next when non-null) does not also recursively delete
// the rest of the chain.
62 Block *ptr = this->head.next;
63 while(ptr != nullptr) {
64 Block *p = ptr->block_link.next;
65 ptr->block_link.next = nullptr;
66 delete ptr;
67 ptr = p;
68 }
69 this->head.next = nullptr;
70 }
71 };
72
// Construct an empty block: all BLOCK_SIZE chunks are threaded onto the
// free list in address order, and each chunk records its array index
// (used by get_block_from_obj to locate the start of the chunks array).
73 Block()
74 : num_alloced_chunks(0)
75 , free_chunk_head(&chunks[0])
76 , block_link()
77 {
78 for(size_t i = 0; i < BLOCK_SIZE - 1; i++) {
79 chunks[i].chunk_link.next = &chunks[i + 1];
80 chunks[i].idx = i;
81 }
// The last chunk terminates the free list.
82 chunks[BLOCK_SIZE - 1].idx = BLOCK_SIZE - 1;
83 chunks[BLOCK_SIZE - 1].chunk_link.next = nullptr;
84 }
85
// Destructor: must only run once every chunk has been freed. If this
// block is still linked to others, deleting it also deletes every block
// chained after it (see the list cleanup in BlockList's destructor).
86 ~Block()
87 {
88 assert((num_alloced_chunks.load() == 0) && "Not all chunks have been freed!");
89 // If we're part of the global list, make sure to clean up the rest of the
90 // list (only done on process exit)
91 if(block_link.next != nullptr) {
92 delete block_link.next;
93 }
94 }
95
// Pop one chunk off this block's lock-free free list and return its raw
// (uninitialized) storage. Returns nullptr when the block is full; the
// CAS below has then also handed the block off for reclamation.
96 void *alloc_obj()
97 {
98 Chunk *old_head = nullptr;
99 ssize_t expected_full_size = BLOCK_SIZE;
100
// If all chunks are allocated (count == BLOCK_SIZE), reset the count
// to zero: subsequent free_obj() calls drive it negative, and the
// block is recycled once it reaches -BLOCK_SIZE (see free_obj()).
101 if(num_alloced_chunks.compare_exchange(expected_full_size, 0)) {
102 // All full up, the block is now flagged for reclamation
103 return nullptr;
104 } else {
105 assert((expected_full_size >= 0) &&
106 "Tried to allocate from a block marked for reclaimation!");
107 }
108
109 old_head = free_chunk_head.load_acquire();
110 assert((old_head != nullptr) && "Non-empty block with no free chunks!");
111
112 // Only the owning thread can pop off the free list, so no ABA issue here
113 // and we know from num_alloced_chunks that there must at least be one
114 // item in the block (old_head cannot be null)
115 Chunk *next = nullptr;
116 do {
117 next = old_head->chunk_link.next;
118 } while(!free_chunk_head.compare_exchange_weak(old_head, next));
119
120 old_head->chunk_link.next = nullptr;
121 num_alloced_chunks.fetch_add_acqrel(1);
122
123 return &old_head->data;
124 }
// Push the chunk backing 'p' onto this block's lock-free free list.
// Returns true exactly when this free empties a block that was flagged
// for reclamation, i.e. the caller should recycle the whole block.
125 bool free_obj(void *p)
126 {
127 Chunk *chunk = get_chunk_from_obj(p);
128 Chunk *old_head = free_chunk_head.load_acquire();
// Multiple threads can push onto the free list at once, so the next
// pointer is re-written on every CAS retry.
130 do {
131 chunk->chunk_link.next = old_head;
132 } while(!free_chunk_head.compare_exchange_weak(old_head, chunk));
// If this block was flagged for reclamation, its allocated-chunk count
// is negative and the block becomes empty when the count reaches
// -BLOCK_SIZE (instead of zero). Tell the caller so it can append the
// block to the global block list.
137 return (num_alloced_chunks.fetch_sub_acqrel(1) - 1) == -BLOCK_SIZE;
138 }
139
140 static Chunk *get_chunk_from_obj(void *p)
141 {
142 return reinterpret_cast<Chunk *>(reinterpret_cast<uintptr_t>(p) -
143 offsetof(Chunk, data));
144 }
145
146 static Block *get_block_from_obj(void *p)
147 {
148 // TODO: replace with pointer arithmetic using the alignment of the block
149 // class.
150 Chunk *chunk = get_chunk_from_obj(p);
151 return reinterpret_cast<Block *>(reinterpret_cast<uintptr_t>(chunk - chunk->idx) -
152 offsetof(Block, chunks));
153 }
154 };
155
156 static typename Block::BlockList free_blocks;
157
158 // When a thread exits, we want to make sure its current block is marked for
159 // reclamation rather than deleted
160 static void release_block(Block *blk)
161 {
162 ssize_t old_num = blk->num_alloced_chunks.load();
// Negate the allocation count (retrying if concurrent frees race with
// us); a negative count tells free_obj() this block awaits reclamation.
163 while(old_num > 0 &&
164 !blk->num_alloced_chunks.compare_exchange_weak(old_num, -old_num))
165 ;
166 // Delete the block if there aren't any outstanding references
167 if(old_num == 0) {
168 delete blk;
169 }
170 }
171
172 public:
// Thread-safe allocation entry point: carves one T-sized piece of raw
// storage out of the calling thread's current block, acquiring a
// recycled or freshly heap-allocated block when the current one is
// exhausted. Returns nullptr only if no block can be obtained. No T
// constructor is run here.
173 static void *alloc_obj()
174 {
// Per-thread current block; the custom deleter (release_block) runs at
// thread exit and flags a still-populated block for reclamation instead
// of deleting it.
177 static thread_local std::unique_ptr<Block, decltype(&release_block)> current_block(
178 nullptr, release_block);
179 void *obj = nullptr;
180
181 if(current_block != nullptr) {
182 obj = current_block->alloc_obj();
183 }
184
// No current block, or it is full: take a recycled block off the global
// free list, or heap-allocate a new one.
185 if(obj == nullptr) {
186 Block *newblk = free_blocks.pop_front();
187 if(newblk == nullptr) {
188 newblk = new(std::nothrow) Block;
189 }
190 if(newblk != nullptr) {
191 obj = newblk->alloc_obj();
192 assert((obj != nullptr) && "Newly acquired block can't allocate!");
// The old (full) block was already flagged for reclamation inside
// Block::alloc_obj(), so drop ownership WITHOUT running the deleter,
// then adopt the new block.
193 current_block.release();
194 current_block.reset(newblk);
195 }
196 }
197
198 return obj;
199 }
// Thread-safe deallocation entry point: returns storage 'p' (obtained
// from alloc_obj) to its owning block. If this free empties a block that
// was flagged for reclamation, the block is reset and recycled onto the
// global free list. No T destructor is run here.
200 static void free_obj(void *p)
201 {
202 Block *block = Block::get_block_from_obj(p);
203 if(block->free_obj(p)) {
204 // This block is empty and ready for reuse!
205 // Push to the front for better cache locality!
// Exactly one thread sees free_obj() return true, so resetting the
// count from -BLOCK_SIZE back to 0 does not race with other frees.
206 block->num_alloced_chunks.store_release(0);
207 free_blocks.push_front(block);
208 }
209 }
210 };
211
212 template <typename T, size_t N>
213 typename CachingAllocator<T, N>::Block::BlockList CachingAllocator<T, N>::free_blocks;
214
215} // namespace Realm
216
217#endif // ifndef REALM_CACHING_ALLOCATOR_H
Definition caching_allocator.h:31
static void free_obj(void *p)
Definition caching_allocator.h:200
static void * alloc_obj()
Definition caching_allocator.h:173
T value_type
Definition caching_allocator.h:33
@ BLOCK_SIZE
Definition caching_allocator.h:36
Definition lists.h:66
IntrusiveListLink< Block > head
Definition lists.h:97
Definition atomics.h:31
T load(void) const
bool compare_exchange_weak(T &expected, T newval)
T fetch_add_acqrel(T to_add)
bool compare_exchange(T &expected, T newval)
T load_acquire(void) const
T fetch_sub_acqrel(T to_sub)
#define REALM_PMTA_DEFN(structtype, membertype, name)
Definition lists.h:40
Definition activemsg.h:38
Definition caching_allocator.h:57
~BlockList(void)
Definition caching_allocator.h:58
Definition caching_allocator.h:41
IntrusiveListLink< Chunk > chunk_link
Definition caching_allocator.h:43
char data[sizeof(T)]
Definition caching_allocator.h:42
REALM_PMTA_DEFN(Chunk, IntrusiveListLink< Chunk >, chunk_link)
size_t idx
Definition caching_allocator.h:46